From 7891a73ef36f4ad7b71069b3c57694f85bb79454 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 20 Aug 2021 21:39:25 +0100
Subject: Move CPU/GPU files from Core/Runtime to the respective backend folders

The legacy structure contained two libraries (core/runtime) with two backends in each. We reduce the core/runtime libraries to a single library, thus merging the backend files.

Signed-off-by: Georgios Pinitas
Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Tested-by: Arm Jenkins
---
 src/core/CL/CLHelpers.cpp | 4 +-
 src/core/CL/CLKernelLibrary.cpp | 2 +-
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 2 +-
 src/core/cpu/ICpuKernel.h | 36 -
 src/core/cpu/kernels/CpuActivationKernel.cpp | 260 ---
 src/core/cpu/kernels/CpuActivationKernel.h | 75 -
 src/core/cpu/kernels/CpuAddKernel.cpp | 296 ----
 src/core/cpu/kernels/CpuAddKernel.h | 84 -
 src/core/cpu/kernels/CpuCastKernel.cpp | 1367 ----------------
 src/core/cpu/kernels/CpuCastKernel.h | 82 -
 src/core/cpu/kernels/CpuCol2ImKernel.cpp | 124 --
 src/core/cpu/kernels/CpuCol2ImKernel.h | 87 -
 src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp | 211 ---
 src/core/cpu/kernels/CpuConcatenateBatchKernel.h | 73 -
 src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp | 207 ---
 src/core/cpu/kernels/CpuConcatenateDepthKernel.h | 81 -
 .../cpu/kernels/CpuConcatenateHeightKernel.cpp | 178 --
 src/core/cpu/kernels/CpuConcatenateHeightKernel.h | 70 -
 src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp | 175 --
 src/core/cpu/kernels/CpuConcatenateWidthKernel.h | 70 -
 .../CpuConvertFullyConnectedWeightsKernel.cpp | 113 --
 .../CpuConvertFullyConnectedWeightsKernel.h | 76 -
 .../CpuConvertQuantizedSignednessKernel.cpp | 142 --
 .../kernels/CpuConvertQuantizedSignednessKernel.h | 63 -
 src/core/cpu/kernels/CpuCopyKernel.cpp | 166 --
 src/core/cpu/kernels/CpuCopyKernel.h | 67 -
 .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 950 -----------
 .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 105 --
 src/core/cpu/kernels/CpuDequantizeKernel.cpp | 400 -----
 src/core/cpu/kernels/CpuDequantizeKernel.h | 63 -
 src/core/cpu/kernels/CpuDirectConv2dKernel.cpp | 1385 ----------------
 src/core/cpu/kernels/CpuDirectConv2dKernel.h | 91 --
 .../kernels/CpuDirectConv2dOutputStageKernel.cpp | 513 ------
 .../cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 85 -
 src/core/cpu/kernels/CpuElementwiseKernel.cpp | 454 -----
 src/core/cpu/kernels/CpuElementwiseKernel.h | 222 ---
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 182 ---
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.h | 81 -
 src/core/cpu/kernels/CpuFillKernel.cpp | 90 -
 src/core/cpu/kernels/CpuFillKernel.h | 60 -
 src/core/cpu/kernels/CpuFloorKernel.cpp | 177 --
 src/core/cpu/kernels/CpuFloorKernel.h | 78 -
 .../cpu/kernels/CpuGemmInterleave4x4Kernel.cpp | 151 --
 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 80 -
 .../kernels/CpuGemmLowpMatrixMultiplyKernel.cpp | 1053 ------------
 .../cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 80 -
 .../kernels/CpuGemmLowpMatrixReductionKernel.cpp | 396 -----
 .../cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 157 --
 .../CpuGemmLowpOffsetContributionKernel.cpp | 417 -----
 .../kernels/CpuGemmLowpOffsetContributionKernel.h | 88 -
 ...GemmLowpOffsetContributionOutputStageKernel.cpp | 946 -----------
 ...puGemmLowpOffsetContributionOutputStageKernel.h | 114 --
 .../CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp | 326 ----
 .../CpuGemmLowpQuantizeDownInt32ScaleKernel.h
| 107 -- ...tizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 227 --- ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 111 -- ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 239 --- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 114 -- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 236 --- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 108 -- .../cpu/kernels/CpuGemmMatrixAdditionKernel.cpp | 200 --- src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h | 88 - .../cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp | 1174 ------------- src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h | 91 -- src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp | 137 -- src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h | 97 -- src/core/cpu/kernels/CpuIm2ColKernel.cpp | 448 ----- src/core/cpu/kernels/CpuIm2ColKernel.h | 123 -- src/core/cpu/kernels/CpuMulKernel.cpp | 1729 -------------------- src/core/cpu/kernels/CpuMulKernel.h | 148 -- src/core/cpu/kernels/CpuPermuteKernel.cpp | 301 ---- src/core/cpu/kernels/CpuPermuteKernel.h | 69 - src/core/cpu/kernels/CpuPool2dKernel.cpp | 516 ------ src/core/cpu/kernels/CpuPool2dKernel.h | 82 - src/core/cpu/kernels/CpuQuantizeKernel.cpp | 266 --- src/core/cpu/kernels/CpuQuantizeKernel.h | 89 - src/core/cpu/kernels/CpuReshapeKernel.cpp | 140 -- src/core/cpu/kernels/CpuReshapeKernel.h | 64 - src/core/cpu/kernels/CpuScaleKernel.cpp | 623 ------- src/core/cpu/kernels/CpuScaleKernel.h | 108 -- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 378 ----- src/core/cpu/kernels/CpuSoftmaxKernel.h | 111 -- src/core/cpu/kernels/CpuSubKernel.cpp | 201 --- src/core/cpu/kernels/CpuSubKernel.h | 84 - src/core/cpu/kernels/CpuTransposeKernel.cpp | 510 ------ src/core/cpu/kernels/CpuTransposeKernel.h | 63 - src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp | 170 -- src/core/cpu/kernels/CpuWeightsReshapeKernel.h | 91 -- src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp | 551 ------- src/core/cpu/kernels/CpuWinogradConv2dKernel.h | 575 ------- src/core/cpu/kernels/activation/list.h | 49 - src/core/cpu/kernels/activation/neon/fp16.cpp | 217 --- src/core/cpu/kernels/activation/neon/fp32.cpp | 212 --- src/core/cpu/kernels/activation/neon/qasymm8.cpp | 262 --- .../cpu/kernels/activation/neon/qasymm8_signed.cpp | 261 --- src/core/cpu/kernels/activation/neon/qsymm16.cpp | 138 -- src/core/cpu/kernels/activation/sve/fp16.cpp | 130 -- src/core/cpu/kernels/activation/sve/fp32.cpp | 131 -- src/core/cpu/kernels/activation/sve/qasymm8.cpp | 253 --- .../cpu/kernels/activation/sve/qasymm8_signed.cpp | 253 --- src/core/cpu/kernels/activation/sve/qsymm16.cpp | 120 -- src/core/cpu/kernels/add/neon/list.h | 143 -- src/core/cpu/kernels/add/neon/qasymm8.cpp | 209 --- src/core/cpu/kernels/add/neon/qasymm8_signed.cpp | 208 --- src/core/cpu/kernels/add/neon/qsymm16.cpp | 174 -- src/core/cpu/kernels/add/sve/impl.cpp | 139 -- src/core/cpu/kernels/add/sve/impl.h | 40 - src/core/cpu/kernels/add/sve/list.h | 51 - src/core/cpu/kernels/add/sve/qasymm8.cpp | 182 --- src/core/cpu/kernels/add/sve/qasymm8_signed.cpp | 181 -- src/core/cpu/kernels/add/sve/qsymm16.cpp | 156 -- .../assembly/CpuGemmAssemblyWrapperKernel.h | 126 -- src/core/cpu/kernels/assembly/arm_gemm.hpp | 190 --- .../kernels/assembly/arm_gemm_compute_iface.hpp | 130 -- src/core/cpu/kernels/assembly/arm_gemm_local.hpp | 31 - .../kernels/assembly/convolution_parameters.hpp | 65 - src/core/cpu/kernels/assembly/gemm_common.hpp | 236 --- src/core/cpu/kernels/assembly/ndrange.hpp | 199 --- .../kernels/elementwise/neon/elementwise_list.h | 486 ------ 
.../elementwise/neon/elementwise_quantized_list.h | 654 -------- .../elementwise/neon/elementwise_unary_list.h | 116 -- .../cpu/kernels/elementwise/sve/elementwise.cpp | 311 ---- .../cpu/kernels/elementwise/sve/elementwise_list.h | 171 -- .../elementwise/sve/elementwise_quantized_list.h | 366 ----- .../kernels/elementwise/sve/elementwise_unary.cpp | 113 -- .../elementwise/sve/elementwise_unary_list.h | 39 - src/core/cpu/kernels/floor/list.h | 41 - src/core/cpu/kernels/floor/neon/fp16.cpp | 64 - src/core/cpu/kernels/floor/neon/fp32.cpp | 61 - .../CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 359 ---- .../CpuDepthwiseConv2dAssemblyWrapperKernel.h | 120 -- .../internal/CpuPool2dAssemblyWrapperKernel.cpp | 279 ---- .../internal/CpuPool2dAssemblyWrapperKernel.h | 119 -- src/core/cpu/kernels/pool2d/neon/fp16.cpp | 317 ---- src/core/cpu/kernels/pool2d/neon/fp32.cpp | 314 ---- src/core/cpu/kernels/pool2d/neon/list.h | 97 -- src/core/cpu/kernels/pool2d/neon/nchw/all.cpp | 700 -------- src/core/cpu/kernels/pool2d/neon/qasymm8.cpp | 41 - .../cpu/kernels/pool2d/neon/qasymm8_signed.cpp | 41 - src/core/cpu/kernels/pool2d/neon/quantized.h | 863 ---------- src/core/cpu/kernels/scale/neon/fp16.cpp | 174 -- src/core/cpu/kernels/scale/neon/integer.cpp | 293 ---- src/core/cpu/kernels/scale/neon/list.h | 185 --- src/core/cpu/kernels/scale/neon/qasymm8.cpp | 145 -- src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp | 145 -- src/core/cpu/kernels/scale/sve/fp16.cpp | 176 -- src/core/cpu/kernels/scale/sve/fp32.cpp | 174 -- src/core/cpu/kernels/scale/sve/integer.cpp | 300 ---- src/core/cpu/kernels/scale/sve/list.h | 47 - src/core/cpu/kernels/scale/sve/qasymm8.cpp | 207 --- src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp | 207 --- src/core/cpu/kernels/softmax/impl/neon/list.h | 388 ----- src/core/cpu/kernels/softmax/impl/sve/impl.cpp | 185 --- src/core/cpu/kernels/softmax/impl/sve/list.h | 223 --- src/core/cpu/kernels/sub/neon/list.h | 159 -- src/core/cpu/kernels/sub/neon/qasymm8.cpp | 230 --- src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp | 229 --- src/core/cpu/kernels/sub/neon/qsymm16.cpp | 201 --- src/core/gpu/cl/ClCompileContext.h | 36 - src/core/gpu/cl/ClKernelLibrary.cpp | 1029 ------------ src/core/gpu/cl/ClKernelLibrary.h | 95 -- src/core/gpu/cl/IClKernel.h | 37 - src/core/gpu/cl/kernels/ClActivationKernel.cpp | 255 --- src/core/gpu/cl/kernels/ClActivationKernel.h | 71 - .../gpu/cl/kernels/ClBatchConcatenateKernel.cpp | 153 -- src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h | 74 - src/core/gpu/cl/kernels/ClCastKernel.cpp | 168 -- src/core/gpu/cl/kernels/ClCastKernel.h | 79 - src/core/gpu/cl/kernels/ClCol2ImKernel.cpp | 175 -- src/core/gpu/cl/kernels/ClCol2ImKernel.h | 89 - .../ClConvertFullyConnectedWeightsKernel.cpp | 124 -- .../kernels/ClConvertFullyConnectedWeightsKernel.h | 73 - src/core/gpu/cl/kernels/ClCopyKernel.cpp | 175 -- src/core/gpu/cl/kernels/ClCopyKernel.h | 69 - src/core/gpu/cl/kernels/ClCropKernel.cpp | 136 -- src/core/gpu/cl/kernels/ClCropKernel.h | 78 - .../gpu/cl/kernels/ClDepthConcatenateKernel.cpp | 139 -- src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h | 74 - src/core/gpu/cl/kernels/ClDequantizeKernel.cpp | 158 -- src/core/gpu/cl/kernels/ClDequantizeKernel.h | 64 - src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 672 -------- src/core/gpu/cl/kernels/ClDirectConv2dKernel.h | 89 - src/core/gpu/cl/kernels/ClElementwiseKernel.cpp | 525 ------ src/core/gpu/cl/kernels/ClElementwiseKernel.h | 200 --- .../gpu/cl/kernels/ClElementwiseUnaryKernel.cpp | 168 -- 
src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h | 65 - src/core/gpu/cl/kernels/ClFillKernel.cpp | 120 -- src/core/gpu/cl/kernels/ClFillKernel.h | 68 - src/core/gpu/cl/kernels/ClFloorKernel.cpp | 124 -- src/core/gpu/cl/kernels/ClFloorKernel.h | 64 - .../ClGemmLowpMatrixMultiplyNativeKernel.cpp | 335 ---- .../kernels/ClGemmLowpMatrixMultiplyNativeKernel.h | 81 - .../ClGemmLowpMatrixMultiplyReshapedKernel.cpp | 300 ---- .../ClGemmLowpMatrixMultiplyReshapedKernel.h | 90 - ...GemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp | 544 ------ ...ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h | 100 -- .../kernels/ClGemmLowpOffsetContributionKernel.cpp | 212 --- .../kernels/ClGemmLowpOffsetContributionKernel.h | 86 - ...GemmLowpOffsetContributionOutputStageKernel.cpp | 263 --- ...ClGemmLowpOffsetContributionOutputStageKernel.h | 90 - ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 160 -- ...mLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 78 - ...GemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 -- ...ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h | 80 - .../ClGemmLowpQuantizeDownInt32ScaleKernel.cpp | 157 -- .../ClGemmLowpQuantizeDownInt32ScaleKernel.h | 80 - .../gpu/cl/kernels/ClGemmLowpReductionKernel.cpp | 219 --- .../gpu/cl/kernels/ClGemmLowpReductionKernel.h | 124 -- .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp | 538 ------ .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h | 88 - .../kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 416 ----- .../cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 88 - .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 421 ----- .../kernels/ClGemmMatrixMultiplyReshapedKernel.h | 113 -- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 443 ----- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 104 -- .../cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp | 224 --- .../gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h | 78 - .../cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp | 175 -- .../gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h | 84 - .../gpu/cl/kernels/ClHeightConcatenateKernel.cpp | 132 -- .../gpu/cl/kernels/ClHeightConcatenateKernel.h | 71 - src/core/gpu/cl/kernels/ClIm2ColKernel.cpp | 431 ----- src/core/gpu/cl/kernels/ClIm2ColKernel.h | 106 -- src/core/gpu/cl/kernels/ClMulKernel.cpp | 439 ----- src/core/gpu/cl/kernels/ClMulKernel.h | 118 -- src/core/gpu/cl/kernels/ClPermuteKernel.cpp | 152 -- src/core/gpu/cl/kernels/ClPermuteKernel.h | 73 - src/core/gpu/cl/kernels/ClPool2dKernel.cpp | 509 ------ src/core/gpu/cl/kernels/ClPool2dKernel.h | 75 - src/core/gpu/cl/kernels/ClQuantizeKernel.cpp | 180 -- src/core/gpu/cl/kernels/ClQuantizeKernel.h | 69 - src/core/gpu/cl/kernels/ClReshapeKernel.cpp | 134 -- src/core/gpu/cl/kernels/ClReshapeKernel.h | 64 - src/core/gpu/cl/kernels/ClScaleKernel.cpp | 213 --- src/core/gpu/cl/kernels/ClScaleKernel.h | 70 - src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp | 365 ----- src/core/gpu/cl/kernels/ClSoftmaxKernel.h | 118 -- src/core/gpu/cl/kernels/ClTransposeKernel.cpp | 124 -- src/core/gpu/cl/kernels/ClTransposeKernel.h | 64 - src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp | 164 -- src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h | 93 -- .../kernels/ClWidthConcatenate2TensorsKernel.cpp | 159 -- .../cl/kernels/ClWidthConcatenate2TensorsKernel.h | 67 - .../kernels/ClWidthConcatenate4TensorsKernel.cpp | 185 --- .../cl/kernels/ClWidthConcatenate4TensorsKernel.h | 70 - .../gpu/cl/kernels/ClWidthConcatenateKernel.cpp | 127 -- src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h | 68 - .../cl/kernels/ClWinogradFilterTransformKernel.cpp 
| 156 -- .../cl/kernels/ClWinogradFilterTransformKernel.h | 77 - .../cl/kernels/ClWinogradInputTransformKernel.cpp | 278 ---- .../cl/kernels/ClWinogradInputTransformKernel.h | 87 - .../cl/kernels/ClWinogradOutputTransformKernel.cpp | 268 --- .../cl/kernels/ClWinogradOutputTransformKernel.h | 85 - src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp | 116 -- src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h | 95 -- src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h | 123 -- .../native/ClGemmDefaultConfigNativeBifrost.cpp | 246 --- .../gemm/native/ClGemmDefaultConfigNativeBifrost.h | 62 - .../native/ClGemmDefaultConfigNativeMidgard.cpp | 73 - .../gemm/native/ClGemmDefaultConfigNativeMidgard.h | 57 - .../native/ClGemmDefaultConfigNativeValhall.cpp | 168 -- .../gemm/native/ClGemmDefaultConfigNativeValhall.h | 59 - .../kernels/gemm/native/ClGemmNativeKernelConfig.h | 71 - .../ClGemmDefaultConfigReshapedBifrost.cpp | 356 ---- .../reshaped/ClGemmDefaultConfigReshapedBifrost.h | 64 - .../ClGemmDefaultConfigReshapedValhall.cpp | 538 ------ .../reshaped/ClGemmDefaultConfigReshapedValhall.h | 61 - .../gemm/reshaped/ClGemmReshapedKernelConfig.h | 69 - .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp | 547 ------- .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.h | 68 - .../ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp | 570 ------- .../ClGemmDefaultConfigReshapedRhsOnlyValhall.h | 61 - .../ClGemmReshapedOnlyRhsKernelConfig.h | 69 - src/core/utils/AssemblyUtils.h | 2 +- 275 files changed, 5 insertions(+), 57288 deletions(-) delete mode 100644 src/core/cpu/ICpuKernel.h delete mode 100644 src/core/cpu/kernels/CpuActivationKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuActivationKernel.h delete mode 100644 src/core/cpu/kernels/CpuAddKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuAddKernel.h delete mode 100644 src/core/cpu/kernels/CpuCastKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCastKernel.h delete mode 100644 src/core/cpu/kernels/CpuCol2ImKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCol2ImKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateBatchKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateDepthKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateHeightKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateWidthKernel.h delete mode 100644 src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h delete mode 100644 src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h delete mode 100644 src/core/cpu/kernels/CpuCopyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCopyKernel.h delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h delete mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp delete mode 100644 
src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h delete mode 100644 src/core/cpu/kernels/CpuElementwiseKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuElementwiseKernel.h delete mode 100644 src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuElementwiseUnaryKernel.h delete mode 100644 src/core/cpu/kernels/CpuFillKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuFillKernel.h delete mode 100644 src/core/cpu/kernels/CpuFloorKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuFloorKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h delete mode 100644 src/core/cpu/kernels/CpuIm2ColKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuIm2ColKernel.h delete mode 100644 src/core/cpu/kernels/CpuMulKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuMulKernel.h delete mode 100644 src/core/cpu/kernels/CpuPermuteKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPermuteKernel.h delete mode 100644 src/core/cpu/kernels/CpuPool2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPool2dKernel.h delete mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.h delete mode 100644 src/core/cpu/kernels/CpuReshapeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuReshapeKernel.h delete mode 100644 src/core/cpu/kernels/CpuScaleKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuScaleKernel.h delete mode 100644 src/core/cpu/kernels/CpuSoftmaxKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuSoftmaxKernel.h delete mode 100644 src/core/cpu/kernels/CpuSubKernel.cpp delete mode 100644 
src/core/cpu/kernels/CpuSubKernel.h delete mode 100644 src/core/cpu/kernels/CpuTransposeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuTransposeKernel.h delete mode 100644 src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuWeightsReshapeKernel.h delete mode 100644 src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuWinogradConv2dKernel.h delete mode 100644 src/core/cpu/kernels/activation/list.h delete mode 100644 src/core/cpu/kernels/activation/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/add/neon/list.h delete mode 100644 src/core/cpu/kernels/add/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/add/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/add/neon/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/add/sve/impl.cpp delete mode 100644 src/core/cpu/kernels/add/sve/impl.h delete mode 100644 src/core/cpu/kernels/add/sve/list.h delete mode 100644 src/core/cpu/kernels/add/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/add/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/add/sve/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm.hpp delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm_local.hpp delete mode 100644 src/core/cpu/kernels/assembly/convolution_parameters.hpp delete mode 100644 src/core/cpu/kernels/assembly/gemm_common.hpp delete mode 100644 src/core/cpu/kernels/assembly/ndrange.hpp delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_list.h delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise.cpp delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h delete mode 100644 src/core/cpu/kernels/floor/list.h delete mode 100644 src/core/cpu/kernels/floor/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/floor/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/pool2d/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/fp32.cpp delete mode 100644 
src/core/cpu/kernels/pool2d/neon/list.h delete mode 100644 src/core/cpu/kernels/pool2d/neon/nchw/all.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/quantized.h delete mode 100644 src/core/cpu/kernels/scale/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/integer.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/list.h delete mode 100644 src/core/cpu/kernels/scale/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/fp16.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/fp32.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/integer.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/list.h delete mode 100644 src/core/cpu/kernels/scale/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/neon/list.h delete mode 100644 src/core/cpu/kernels/softmax/impl/sve/impl.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/sve/list.h delete mode 100644 src/core/cpu/kernels/sub/neon/list.h delete mode 100644 src/core/cpu/kernels/sub/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/sub/neon/qsymm16.cpp delete mode 100644 src/core/gpu/cl/ClCompileContext.h delete mode 100644 src/core/gpu/cl/ClKernelLibrary.cpp delete mode 100644 src/core/gpu/cl/ClKernelLibrary.h delete mode 100644 src/core/gpu/cl/IClKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClActivationKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClActivationKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCastKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCastKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCol2ImKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCol2ImKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCopyKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCopyKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCropKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCropKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClFillKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClFillKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClFloorKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClFloorKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp 
delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClIm2ColKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClIm2ColKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClMulKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClMulKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPermuteKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPermuteKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClReshapeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClReshapeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClScaleKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClScaleKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClSoftmaxKernel.h delete mode 
100644 src/core/gpu/cl/kernels/ClTransposeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClTransposeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h delete mode 100644 src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h (limited to 'src/core') diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 5c53455eeb..10ccc4f9a4 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/ClKernelLibrary.h" +#include "src/gpu/cl/ClKernelLibrary.h" #include #include diff --git 
a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index d8983fcae9..c5a0796c3a 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Error.h" -#include "src/core/gpu/cl/ClKernelLibrary.h" +#include "src/gpu/cl/ClKernelLibrary.h" #include #include #include diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 1437b5bebb..2b74f91a05 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -34,9 +34,9 @@ #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/src/core/cpu/ICpuKernel.h b/src/core/cpu/ICpuKernel.h deleted file mode 100644 index 650b3a7d0b..0000000000 --- a/src/core/cpu/ICpuKernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICPUKERNEL_H -#define ARM_COMPUTE_ICPUKERNEL_H - -#include "arm_compute/core/CPP/ICPPKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -using ICpuKernel = arm_compute::ICPPKernel; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICPUKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp deleted file mode 100644 index dad2ecfc5b..0000000000 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuActivationKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/activation/list.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ActivationSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using ActivationSelectorPtr = std::add_pointer::type; -using ActivationKernelPtr = std::add_pointer::type; - -struct ActivationKernel -{ - const char *name; - const ActivationSelectorPtr is_selected; - ActivationKernelPtr ukernel; -}; - -static const ActivationKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation) - }, - { - "sve_fp32_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation) - }, - { - "neon_fp32_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve_qu8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation) - }, - { - "sve_qs8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation) - }, - { - "sve_qs16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); }, - 
REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ - { - "neon_qu8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation) - }, - { - "neon_qs8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation) - }, - { - "neon_qs16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) - }, -}; - -const ActivationKernel *get_implementation(const ActivationSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -/* Supported activation in the 8-bit integer domain */ -static const std::array qasymm8_activations = -{ - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, -}; -/* Supported activation in the 16-bit integer domain */ -static const std::array qsymm16_activations = -{ - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH -}; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); - - const auto *uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), - "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), - "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) - && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - if(dst != nullptr) - { - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - return std::make_pair(Status{}, win); -} -} // namespace - -void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - - const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _act_info = activation_info; - _run_method = uk->ukernel; - _name = std::string("CpuActivationKernel").append("/").append(uk->name); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICPPKernel::configure(win_config.second); -} - -Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? 
dst->clone().get() : nullptr).first); - - return Status{}; -} - -void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - // Early exit on disabled activation - if(!_act_info.enabled()) - { - return; - } - - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, _act_info, window); -} - -const char *CpuActivationKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h deleted file mode 100644 index 37650345fe..0000000000 --- a/src/core/cpu/kernels/CpuActivationKernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the activation kernel */ -class CpuActivationKernel : public ICpuKernel -{ -public: - CpuActivationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel); - /** Configure kernel for a given list of arguments - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] activation_info Activation layer information. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuActivationKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using ActivationKernelPtr = std::add_pointer::type; - -private: - ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp deleted file mode 100644 index 61b7b19443..0000000000 --- a/src/core/cpu/kernels/CpuAddKernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuAddKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/add/neon/list.h" -#include "src/core/cpu/kernels/add/sve/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct AddSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using AddSelectorPtr = std::add_pointer::type; -using AddKernelPtr = std::add_pointer::type; -struct AddKernel -{ - const char *name; - const AddSelectorPtr is_selected; - AddKernelPtr ukernel; -}; - -static const AddKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); - }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve) - }, - { - "sve2_qs8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); - }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve) - }, - { - "sve2_qs16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QSYMM16) && data.ci.has_sve(); - }, - REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve) - }, -#endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F32) && data.ci.has_sve(); - }, - REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_fp16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F16) && data.ci.has_sve(); - }, - REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_u8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::U8) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_s16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::S16) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_s32_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::S32) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_add", - [](const AddSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F16) && data.ci.has_fp16(); - }, - REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_u8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, - { - "neon_s16_add", - [](const AddSelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, - { - "neon_s32_add", - [](const AddSelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if 
defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) - }, - { - "neon_qs8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) - }, - { - "neon_qs16_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, cpuinfo })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) - || (src1.data_type() != dst.data_type())), - "Broadcasting across width is supported on configurations where all tensors have the same data type"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - - const auto *uk = get_implementation(CPUInfo::get(), src0.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst) -{ - const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(dst, out_shape); - set_data_type_if_unknown(dst, src0.data_type()); - - Window win = calculate_max_window(out_shape, Steps()); - - // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped - return std::make_pair(Status{}, win); -} -} // namespace - -void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - - const auto uk = get_implementation(CPUInfo::get(), src0->data_type()); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _policy = policy; - _run_method = uk->ukernel; - _name = 
std::string("CpuAddKernel").append("/").append(uk->name); - - // Configure kernel window - auto win_config = validate_and_configure_window(*src0, *src1, *dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first); - - return Status{}; -} - -void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, _policy, window); -} - -const char *CpuAddKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h deleted file mode 100644 index 1205b45dfb..0000000000 --- a/src/core/cpu/kernels/CpuAddKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ADD_KERNEL_H -#define ARM_COMPUTE_CPU_ADD_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform addition between two tensors */ -class CpuAddKernel : public ICpuKernel -{ -public: - CpuAddKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel); - /** Initialise the kernel's input, dst and border mode. 
- * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] policy Overflow policy. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuAddKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using AddKernelPtr = std::add_pointer::type; - -private: - ConvertPolicy _policy{}; - AddKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ADD_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCastKernel.cpp b/src/core/cpu/kernels/CpuCastKernel.cpp deleted file mode 100644 index 46f3c330ef..0000000000 --- a/src/core/cpu/kernels/CpuCastKernel.cpp +++ /dev/null @@ -1,1367 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
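For reference, the micro-kernel dispatch that CpuAddKernel uses above — an ordered table of {name, selector predicate, function pointer} entries that get_implementation() scans until the first predicate returns true — can be sketched in isolation as follows. This is a minimal standalone C++ illustration with made-up names, not code from the library:

#include <cstdio>

enum class DataType { F32, S32 };

// Selection data passed to every predicate; mirrors the idea of AddSelectorData.
struct SelectorData
{
    DataType dt;
    bool     has_sve;
};

using SelectorPtr = bool (*)(const SelectorData &);
using KernelPtr   = void (*)();

struct KernelEntry
{
    const char *name;
    SelectorPtr is_selected;
    KernelPtr   ukernel;
};

void sve_fp32_kernel() { std::printf("running sve_fp32\n"); }
void neon_fp32_kernel() { std::printf("running neon_fp32\n"); }

// Order encodes priority: more specialised implementations come first.
static const KernelEntry available_kernels[] =
{
    { "sve_fp32",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, sve_fp32_kernel },
    { "neon_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; },              neon_fp32_kernel },
};

// First matching entry wins; nullptr means no implementation is available.
const KernelEntry *get_implementation(const SelectorData &d)
{
    for(const auto &entry : available_kernels)
    {
        if(entry.is_selected(d))
        {
            return &entry;
        }
    }
    return nullptr;
}

int main()
{
    const KernelEntry *k = get_implementation({ DataType::F32, /* has_sve */ false });
    if(k != nullptr)
    {
        k->ukernel(); // falls through to the NEON entry on a machine without SVE
    }
    return 0;
}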
- */ -#include "src/core/cpu/kernels/CpuCastKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst); - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, - DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, - DataType::U32, DataType::S32, DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), - "Only data_types supported [in] U16 -> [out] U8, U32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), - "Only data_types supported [in] S16 -> [out] U8, S32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32, - "Only data_types supported [in] BFLOAT16 -> [out] F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), - "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != 
DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*dst, src->tensor_shape()); - - _policy = policy; - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICPPKernel::configure(win); -} - -Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); - return Status{}; -} - -void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST); - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - ARM_COMPUTE_ERROR_ON(_src == _dst); - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src(_src, win); - Iterator dst(_dst, win); - - switch(_src->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - switch(_dst->info()->data_type()) - { - case DataType::S16: - { - /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t 
texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast(src.ptr())); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Up-conversion QASYMM8_SIGNED -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } - - case DataType::QASYMM8: - case DataType::U8: - { - switch(_dst->info()->data_type()) - { - case DataType::S16: - { - /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto 
src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::U16: - { - /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const uint16x8x2_t texels = - { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; - - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type 
not supported"); - } - break; - } - case DataType::S16: - { - switch(_dst->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::U8: - { - /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::S32: - { - /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - const int32x4x4_t texels_s32 = - { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; - - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - 
vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } - case DataType::U16: - { - switch(_dst->info()->data_type()) - { - case DataType::U8: - { - /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - - }, - src, dst); - } - break; - } - case DataType::U32: - { - /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - switch(_dst->info()->data_type()) - { - case DataType::F32: - { - /* Up-conversion BFLOAT16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(reinterpret_cast(src.ptr())), - vld1q_u16(reinterpret_cast(src.ptr()) + 8) - } - }; - - vst1q_f32(reinterpret_cast(dst.ptr()), - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 4, - 
vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 8, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 12, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16))); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = float(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type unsupported"); - } - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(_dst->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8), - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8), - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - 
vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(_dst->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Down-conversion F32 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - { - /* Down-conversion F32 -> BFLOAT16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()), - reinterpret_cast(dst.ptr())); - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()) + 8, - reinterpret_cast(dst.ptr()) + 8); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = *(src_ptr + x); - } - }, - src, dst); - break; - } -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ - case DataType::S32: - { - /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - 
int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - - case DataType::S32: - switch(_dst->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Down-conversion S32 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12), - } - }; - - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } - - // Compute left-over elements - for(; 
x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } - - // Compute left-over elements - for(; x < window_end_x; 
++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } -} - -const char *CpuCastKernel::name() const -{ - return "CpuCastKernel.cpp"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuCastKernel.h b/src/core/cpu/kernels/CpuCastKernel.h deleted file mode 100644 index 2a75c5850e..0000000000 --- a/src/core/cpu/kernels/CpuCastKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H -#define ARM_COMPUTE_CPU_CAST_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Casts a given tensor to a new type - * - * @note When casting between quantized types the scale and zeroPoint are ignored - */ -class CpuCastKernel : public ICpuKernel -{ -public: - CpuCastKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel); - /** Set the src and dst of the kernel - * - * Valid conversions src -> dst : - * - * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - BFLOAT16 -> F32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 - * - * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. - * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. - * @param[in] policy Conversion policy. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuCastKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - ConvertPolicy _policy{ ConvertPolicy::SATURATE }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCol2ImKernel.cpp b/src/core/cpu/kernels/CpuCol2ImKernel.cpp deleted file mode 100644 index f860825de6..0000000000 --- a/src/core/cpu/kernels/CpuCol2ImKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuCol2ImKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims) -{ - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
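Stepping back to the cast kernel above: every down-conversion branches on ConvertPolicy::SATURATE versus the wrapping default, as in the S16 -> U8 case. The difference can be illustrated with a small standalone C++ program (the helper names are invented for the example; this is not the kernel's code):

#include <cstdint>
#include <cstdio>

// SATURATE: clamp to the representable range of the destination type.
static uint8_t cast_s16_to_u8_saturate(int16_t v)
{
    if(v < 0)   return 0;
    if(v > 255) return 255;
    return static_cast<uint8_t>(v);
}

// Wrapping policy: keep only the low 8 bits, exactly like a plain static_cast.
static uint8_t cast_s16_to_u8_wrap(int16_t v)
{
    return static_cast<uint8_t>(v);
}

int main()
{
    const int16_t samples[] = { -7, 100, 300 };
    for(int16_t s : samples)
    {
        std::printf("%4d -> saturate: %3d  wrap: %3d\n",
                    static_cast<int>(s),
                    static_cast<int>(cast_s16_to_u8_saturate(s)),
                    static_cast<int>(cast_s16_to_u8_wrap(s)));
    }
    return 0;
}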
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - // Validate configured output - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuCol2ImKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims)); - - _convolved_dims = convolved_dims; - - // Configure kernel window - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, false))); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *output, const Size2D &convolved_dims) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, output, convolved_dims)); - return Status{}; -} - -void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const uint8_t el_size = src->info()->element_size(); - const int output_stride_x = dst->info()->strides_in_bytes().x(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Create iterators - Iterator in(src, window); - Iterator out(dst, window_out); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int hidx = id.y(); - const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; - std::memcpy(out.ptr() + idx, in.ptr(), el_size); - }, - in, out); -} - -const char *CpuCol2ImKernel::name() const -{ - return "CpuCol2ImKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuCol2ImKernel.h b/src/core/cpu/kernels/CpuCol2ImKernel.h deleted file mode 100644 index 3c1802230b..0000000000 --- a/src/core/cpu/kernels/CpuCol2ImKernel.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_COL2IM_KERNEL_H -#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform col2im reshaping. - * - * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CpuIm2ColKernel. - * - * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: - * - * @f[ - * \left( \begin{array}{ccccccccc} - * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccc} - * a0 & a1 & a2 \\ - * a3 & a4 & a5 \\ - * a6 & a7 & a8 \\ - * \end{array} \right) - * @f] - */ -class CpuCol2ImKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuCol2ImKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCol2ImKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. Data types supported: All - * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input - * @param[in] convolved_dims Output convolved dimensions. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuCol2ImKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - Size2D _convolved_dims{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_COL2IM_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp deleted file mode 100644 index 16c0efc793..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
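The Col2Im kernel shown above rearranges each column element back into an image block using only hidx / width and hidx % width, matching the 9-element to 3x3 example in its class comment. A standalone sketch of the same index mapping (illustrative only, not library code):

#include <cstdio>

int main()
{
    const int width  = 3;
    const int height = 3;

    // One flattened column of width * height elements, as produced by im2col.
    const int col[width * height] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };

    int image[height][width] = {};
    for(int hidx = 0; hidx < width * height; ++hidx)
    {
        // Same arithmetic as the kernel's destination-offset computation.
        image[hidx / width][hidx % width] = col[hidx];
    }

    for(int y = 0; y < height; ++y)
    {
        std::printf("%d %d %d\n", image[y][0], image[y][1], image[y][2]);
    }
    return 0;
}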
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -template -void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window) -{ - // Offset src - uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - - // Offset dst - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / dst->info()->element_size(); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + 
src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); - - return Status{}; -} -} // namespace - -void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - - _func = nullptr; - _batch_offset = batch_offset; - - switch(src->data_type()) - { - case DataType::S8: - case DataType::U8: - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - _func = &batch_concat; - break; - case DataType::S16: - case DataType::U16: - case DataType::F16: - _func = &batch_concat; - break; - case DataType::S32: - case DataType::U32: - case DataType::F32: - _func = &batch_concat; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int batch_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); - return Status{}; -} - -void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _batch_offset, - window); -} - -const char *CpuConcatenateBatchKernel::name() const -{ - return "CpuConcatenateBatchKernel"; -} -} // namespace kernels -} // namespace 
cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h b/src/core/cpu/kernels/CpuConcatenateBatchKernel.h deleted file mode 100644 index 1706926fa8..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the batch concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CpuConcatenateBatchKernel : public ICpuKernel -{ -public: - CpuConcatenateBatchKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - */ - void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateBatchKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); - -private: - BatchConcatFunction *_func{ nullptr }; - unsigned int _batch_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp deleted file mode 100644 index 133499deb6..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
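When the source and destination quantization parameters differ, the concatenation kernels above dequantize with the source parameters and requantize with the destination parameters, 16 lanes at a time in the vector loop and element by element in the scalar tail. A minimal scalar sketch of that per-element step, with illustrative names and rounding/saturation that may differ in detail from the library's quantize/dequantize helpers:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Dequantize with (src_scale, src_offset), requantize with (dst_scale, dst_offset),
// saturating to the QASYMM8 range. Sketch only.
inline uint8_t requantize_qasymm8(uint8_t in, float src_scale, int src_offset, float dst_scale, int dst_offset)
{
    const float real = (static_cast<int>(in) - src_offset) * src_scale;
    const int   q    = static_cast<int>(std::lround(real / dst_scale)) + dst_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}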
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -template -void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window) -{ - // Offset source - uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - - // Offset destination - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / dst->info()->element_size(); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const 
Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - - return Status{}; -} -} // namespace - -void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - - _func = nullptr; - _depth_offset = depth_offset; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &depth_concat; - break; - case DataType::QASYMM8_SIGNED: - _func = &depth_concat; - break; - case DataType::F16: - _func = &depth_concat; - break; - case DataType::F32: - _func = &depth_concat; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int depth_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); - return Status{}; -} - -void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _depth_offset, - window); -} - -const char *CpuConcatenateDepthKernel::name() const -{ - return "CpuConcatenateDepthKernel"; -} -} // namespace kernels -} // namespace cpu -} // 
namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h b/src/core/cpu/kernels/CpuConcatenateDepthKernel.h deleted file mode 100644 index 3ec19a86d1..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -namespace cpu -{ -namespace kernels -{ -/** Interface for the depth concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CpuConcatenateDepthKernel : public ICpuKernel -{ -public: - CpuConcatenateDepthKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. 
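As in the batch case, the depth kernel reaches the right destination slice purely through byte strides: the destination pointer is advanced by depth_offset planes along axis 2 before any copying starts. A small sketch of that address computation, with illustrative parameter names (the real values come from ITensorInfo::offset_first_element_in_bytes() and strides_in_bytes() as shown above):

#include <cstddef>
#include <cstdint>

// Return a pointer to the first destination element of the slice that starts
// depth_offset planes into axis 2 (Z). Mirrors the offsetting done above.
uint8_t *offset_into_depth(uint8_t *dst_base, std::size_t offset_first_element_in_bytes,
                           std::size_t stride_z_in_bytes, unsigned int depth_offset)
{
    return dst_base + offset_first_element_in_bytes + depth_offset * stride_z_in_bytes;
}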
- * - */ - void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateDepthKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); - -private: - DepthConcatFunction *_func{ nullptr }; - unsigned int _depth_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp deleted file mode 100644 index dfd442b10a..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - - return Status{}; -} -} // namespace - -void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - - _height_offset = height_offset; - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); - return Status{}; -} - -void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Offset destination pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); - const int window_step_x = 16; - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); - - // Create iterators - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), 
dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -const char *CpuConcatenateHeightKernel::name() const -{ - return "CpuConcatenateHeightKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h b/src/core/cpu/kernels/CpuConcatenateHeightKernel.h deleted file mode 100644 index e5e15e1aee..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the height concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class CpuConcatenateHeightKernel : public ICpuKernel -{ -public: - CpuConcatenateHeightKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All - * @param[in] height_offset The starting offset on the Y axis for the output tensor. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
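A minimal usage sketch for the height concatenation kernel above, checking validate() before configure(). Shapes and the offset are illustrative; at run time the caller packs ITensor pointers as ACL_SRC / ACL_DST and dispatches run_op() over the configured window, as shown above.

#include "arm_compute/core/TensorInfo.h"
#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"

using namespace arm_compute;

void configure_height_concat_example()
{
    // One 32x8x3 F32 slice written at row offset 8 of a 32x24x3 destination.
    const TensorInfo   src(TensorShape(32U, 8U, 3U), 1, DataType::F32);
    TensorInfo         dst(TensorShape(32U, 24U, 3U), 1, DataType::F32);
    const unsigned int height_offset = 8;

    if(bool(cpu::kernels::CpuConcatenateHeightKernel::validate(&src, height_offset, &dst)))
    {
        cpu::kernels::CpuConcatenateHeightKernel kernel;
        kernel.configure(&src, height_offset, &dst);
        // Run-time dispatch happens through an ITensorPack holding ACL_SRC/ACL_DST.
    }
}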
- * - */ - void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateHeightKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _height_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp deleted file mode 100644 index ad33b0c951..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - - return Status{}; -} -} // namespace - -void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - ARM_COMPUTE_UNUSED(dst); - - _width_offset = width_offset; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); - return Status{}; -} - -void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Offset output pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); - constexpr int window_step_x = 16; - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator src_it(src, win); - Iterator dst_it(dst, win); - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - int x = window_start_x; - 
for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -const char *CpuConcatenateWidthKernel::name() const -{ - return "CpuConcatenateWidthKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h b/src/core/cpu/kernels/CpuConcatenateWidthKernel.h deleted file mode 100644 index f64191e173..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the width concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class CpuConcatenateWidthKernel : public ICPPKernel -{ -public: - CpuConcatenateWidthKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All - * @param[in] width_offset The offset on the X axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
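The width and height kernels iterate in bytes rather than elements: the X loop bound is the element count scaled by element_size, the main loop advances 16 bytes (one 128-bit vector) per step, and a scalar tail copies the remainder. The same structure, stripped of NEON and iterators, looks like this (sketch only):

#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_row_bytes(const uint8_t *in_ptr, uint8_t *out_ptr, std::size_t num_elements, std::size_t element_size)
{
    const std::size_t end  = num_elements * element_size; // loop bound in bytes, as above
    const std::size_t step = 16;                          // one vector register worth of bytes
    std::size_t       x    = 0;
    for(; x + step <= end; x += step)
    {
        std::memcpy(out_ptr + x, in_ptr + x, step); // stands in for wrapper::vloadq / vstore
    }
    for(; x < end; ++x) // left-over bytes
    {
        out_ptr[x] = in_ptr[x];
    }
}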
- */ - void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateWidthKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _width_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp deleted file mode 100644 index 5406356bc9..0000000000 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) - -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialisation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - - ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); - - const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; - - const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL); - - const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx]; - const unsigned int num_channels = original_input_shape[channel_idx]; - - _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels; - _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3)); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); - - // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x(); - const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y(); - const unsigned int element_size = src->info()->element_size(); - - Iterator input(src, window); - Iterator output(dst, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); - }, - input); -} - -const char *CpuConvertFullyConnectedWeightsKernel::name() const -{ - return "CpuConvertFullyConnectedWeightsKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h deleted file mode 100644 index 7baaf13417..0000000000 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H -#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. - * - * @note This function can be applied to the 2D weights used by a Fully Connected layer if: - * - It follows a Convolution layer - * - The data layout used by the network does not match the one the model has been trained in. - * - * @note This function assumes the weights are already reshaped (transposed) - */ -class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel -{ -public: - CpuConvertFullyConnectedWeightsKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel); - /** Set the src and dst tensor. - * - * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. - * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src. - * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. 
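The conversion above is a pure row permutation of the 2D weights: source row y is written to row (y % _factor1) * _factor2 + y / _factor1 of the destination, which reorders the flattened input between plane-major (NCHW) and channel-interleaved (NHWC) orderings. A small sketch of the index computation with an illustrative worked case:

#include <cstddef>

// Destination row for source row y, mirroring the offset computation in
// run_op() above. factor1/factor2 follow the member documentation below.
inline std::size_t converted_row(std::size_t y, std::size_t factor1, std::size_t factor2)
{
    return (y % factor1) * factor2 + y / factor1;
}

// Example: original input shape 2x2x3 (W x H x C), trained in NCHW, so
// factor1 = W * H = 4 and factor2 = C = 3. The 12 weight rows map
// 0->0, 1->3, 2->6, 3->9, 4->1, 5->4, ... i.e. channels become contiguous
// per spatial position, matching NHWC ordering of the flattened input.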
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ - unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp deleted file mode 100644 index 26cbb48deb..0000000000 --- a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - - // Validate output if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Output auto inizialitation if not yet initialized - { - const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; - const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - const int offset_correction = is_input_signed ? -128 : 128; - const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); - - auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); - } - - return std::make_pair(Status{}, calculate_max_window(*dst)); -} -} // namespace - -void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - std::pair win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const uint8_t mask = 128; - const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += 
window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const uint8_t in = *(reinterpret_cast(input_ptr + x)); - *(output_ptr + x) = in ^ mask; - } - }, - input, output); -} - -const char *CpuConvertQuantizedSignednessKernel::name() const -{ - return "CpuConvertQuantizedSignednessKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h deleted file mode 100644 index 2a8f6c364d..0000000000 --- a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H -#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ -class CpuConvertQuantizedSignednessKernel : public ICpuKernel -{ -public: - CpuConvertQuantizedSignednessKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel); - /** Initialize the kernel input and output info. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data types supported: opposite of @p src. 
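The kernel above flips each stored byte with an XOR against 0x80, which maps a signed value s in [-128, 127] to the unsigned value s + 128 and back again; validate_and_configure_window() additionally corrects the output quantization offset by 128 so the two encodings describe the same real values. A scalar sketch of the byte flip (illustrative helper names):

#include <cstdint>

inline uint8_t signed_to_unsigned_qasymm8(int8_t s)
{
    return static_cast<uint8_t>(static_cast<uint8_t>(s) ^ 0x80); // -128 -> 0, 0 -> 128, 127 -> 255
}

inline int8_t unsigned_to_signed_qasymm8(uint8_t u)
{
    return static_cast<int8_t>(u ^ 0x80); // 0 -> -128, 128 -> 0, 255 -> 127
}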
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuConvertQuantizedSignednessKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCopyKernel.cpp b/src/core/cpu/kernels/CpuCopyKernel.cpp deleted file mode 100644 index 8ec354b2aa..0000000000 --- a/src/core/cpu/kernels/CpuCopyKernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuCopyKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); - - // Validate destination if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src); - return std::make_pair(Status{}, calculate_max_window(*dst)); -} - -std::pair validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) -{ - const TensorShape src_shape = src->tensor_shape(); - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape)); - // Configure window - const Window win = calculate_max_window(*dst, dst->dimension(0)); - return std::make_pair(Status{}, win); -} - -} // namespace - -void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding)); - - _padding = padding; - - std::pair win_config; - if(padding.empty()) - { - win_config = validate_and_configure_window(src, dst); - } - else - { - win_config = validate_and_configure_window_with_padding(src, dst, padding); - } - - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); - - if(padding.empty()) - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); - } - - return Status{}; -} - -void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_padding.empty()) - { - Window dst_window{ window }; - dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); - Window out_slice = 
dst_window.first_slice_window_1D(); - do - { - Iterator src_it(src, out_slice); - Iterator dst_it(dst, out_slice); - - execute_window_loop(out_slice, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); - }, - src_it, dst_it); - } - while(dst_window.slide_window_slice_1D(out_slice)); - } - else - { - Window src_window{ window }; - src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); - - Iterator src_it(src, src_window); - Iterator dst_it(dst, window); - const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); - std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); - }, - src_it, dst_it); - } -} - -const char *CpuCopyKernel::name() const -{ - return "CpuCopyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuCopyKernel.h b/src/core/cpu/kernels/CpuCopyKernel.h deleted file mode 100644 index e2f1ed60a6..0000000000 --- a/src/core/cpu/kernels/CpuCopyKernel.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H -#define ARM_COMPUTE_CPU_COPY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform a copy between two tensors */ -class CpuCopyKernel : public ICpuKernel -{ -public: - CpuCopyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: same as @p src. 
- * @param[in] padding (Optional) Padding to be applied to the input tensor - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuCopyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PaddingList _padding{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp deleted file mode 100644 index 5530eba9f1..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ /dev/null @@ -1,950 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -constexpr auto data_layout = DataLayout::NHWC; -const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); -const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); -const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - -constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); -constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); -constexpr size_t vector_size = 8; - -struct DepthwiseConvolutionRunInfo -{ - const size_t num_read_elements_per_iteration; - const uint32_t x_start; - const uint32_t x_end; - const uint32_t x_step; - const uint32_t x_leftover_start; - const size_t input_stride_y; - const size_t input_stride_z; - const size_t input_max_offset; - const size_t weights_width; - const size_t weights_height; - const size_t weights_stride_y; - const size_t weights_stride_z; - const size_t conv_stride_x; - const size_t conv_stride_y; - const size_t conv_pad_left; - const size_t conv_pad_top; - const size_t input_height; - const size_t input_width; - const size_t input_depth; - - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), - x_start(w.x().start()), - x_end(w.x().end()), - x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), - x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), - input_stride_y(input.strides_in_bytes().y()), - input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), - weights_width(weights.dimension(width_idx)), - weights_height(weights.dimension(height_idx)), - weights_stride_y(weights.strides_in_bytes().y()), - weights_stride_z(weights.strides_in_bytes().z()), - conv_stride_x(conv_info.stride().first), - conv_stride_y(conv_info.stride().second), - conv_pad_left(conv_info.pad_left()), - conv_pad_top(conv_info.pad_top()), - input_height(input.dimension(height_idx)), - input_width(input.dimension(width_idx)), - input_depth(input.dimension(channel_idx)) - { - } -}; - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b) -{ - return vqrdmulhq_n_s32(a, b); -} - -inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) -{ - const int32_t current_h = base_h + h * dilation.y(); - const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); - - const int32_t current_w = base_w + w * dilation.x(); - const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); - - return is_valid_h && is_valid_w; -} - -template -void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); - - const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); 
- - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(uint32_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } - - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
*reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; - } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) -{ - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); - - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; - } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - ARM_COMPUTE_UNUSED(output_multiplier, output_shift); - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - using AccType = int32_t; - using AccArrayType = std::array; - - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); - const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); - - const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; - - auto weights_ptr = base_weights_ptr; - auto input_offset = 
base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; - - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); - } - - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
- *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - T out_vals{ 0 }; - - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; - - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } - - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); - - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; - } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - using AccType = int32_t; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); - - const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < 
run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } - - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; - - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - } - - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; - } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - constexpr int half_vec = vector_size / 2; - - using AccType = int32_t; - using AccVectorType = typename wrapper::traits::neon_vector::type; - using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); - - const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); - const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); - const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); - - const auto out_mul = output_multiplier.at(0); - const auto out_shift = output_shift.at(0); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, 
dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - std::vector acc0(depth_multiplier / vector_size); - std::vector acc1(depth_multiplier / vector_size); - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) - { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); - } - } - - offs += dilation.x() * run_info.input_stride_y; - } - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) - { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } - - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } - - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); - acc1.at(i) = 
wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); - - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); - ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized_asymmetric(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - } - - if(dst->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); - - _conv_info = info.pad_stride_info; - _depth_multiplier = info.depth_multiplier; - _dilation = info.dilation; - _has_biases = (biases != nullptr); - - if(is_data_type_quantized(src->data_type())) - { - const auto input_scale = src->quantization_info().uniform().scale; - const auto output_scale = dst->quantization_info().uniform().scale; - - auto weights_scale = weights->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - for(size_t i = 1; i < weights->dimension(channel_idx); ++i) - { - weights_scale.push_back(weights_scale.front()); - } - } - - for(const auto &s : weights_scale) - { - int32_t out_mult = 0; - int32_t out_shift = 0; - const float multiplier = input_scale * s / output_scale; - arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); - - _output_multiplier.push_back(out_mult); - _output_shift.push_back(out_shift); - } - } - - switch(weights->data_type()) - { - case DataType::QASYMM8: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - case DataType::QASYMM8_SIGNED: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - case DataType::QSYMM8_PER_CHANNEL: - if(src->data_type() == DataType::QASYMM8) - { - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - } - else - { - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); - - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); - return Status{}; -} - -template > -void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_fp(src, weights, biases, dst, _conv_info, _dilation, window, has_biases); - } - else - { - depthwise_loop_generic_fp(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases); - } -} - -template > -void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_quantized(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); - } - else - { - const bool is_pow2 = 
((_depth_multiplier & (_depth_multiplier - 1)) == 0); - const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - - if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) - { - depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - else - { - depthwise_loop_generic_quantized(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - } -} - -void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, weights, biases, dst, window, _has_biases); -} - -const char *CpuDepthwiseConv2dNativeKernel::name() const -{ - return "CpuDepthwiseConv2dNativeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h deleted file mode 100644 index eb7041f7b6..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H - -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" -#include "support/Requires.h" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to run a depthwise convolution native on a tensor. 
 */
-class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
-{
-public:
-    CpuDepthwiseConv2dNativeKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
-
-    /** Initialize the function's source, destination and parameters.
-     *
-     * @note Supported data layouts: NHWC
-     *
-     * @param[in]  src     Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
-     *                     Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases  Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                     Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] dst     Destination tensor. Data type supported: Same as @p src.
-     * @param[in]  info    Depthwise convolution meta-data.
-     *
-     */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to CpuDepthwiseConv2dNativeKernel::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-    const char *name() const override;
-
-private:
-    template <typename T>
-    using FloatEnalber = typename std::enable_if<std::is_floating_point<T>::value, int>::type;
-
-    template <typename T, typename TW, FloatEnalber<T> = 0>
-    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    template <typename T>
-    using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
-
-    template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
-    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    /** Common signature for all the specialised depthwise convolution native functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    DepthwiseFunctionPtr _func{ nullptr };
-    PadStrideInfo        _conv_info{};
-    unsigned int         _depth_multiplier{ 1 };
-    Size2D               _dilation{};
-    std::vector<int32_t> _output_multiplier{};
-    std::vector<int32_t> _output_shift{};
-    bool                 _has_biases{ false };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.cpp b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
deleted file mode 100644
index 42b5439697..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDequantizeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); - - if(dst->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -template -inline void store_result(T *ptr, const float32x4x4_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result(float *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); - wrapper::vstore(ptr + 8, v.val[2]); - wrapper::vstore(ptr + 12, v.val[3]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result(float16_t *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); - wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -inline void store_result(T *ptr, const float32x4x2_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result(float *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result(float16_t *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template 
-void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - const int32_t offset = qinfo.offset; - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t vscale = - { - { - scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], - scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], - scale[x + 8], scale[x + 9], 
scale[x + 10], scale[x + 11], - scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] - } - }; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[x])); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale)); - } - }, - in, out); -} - -template -void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); - } - }, - in, out); -} - -template -void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) -{ - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - run_dequantization_qasymm8(input, output, window); - break; - case DataType::QASYMM8_SIGNED: - run_dequantization_qasymm8(input, output, window); - break; - case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC ? 
run_dequantization_qsymm8_per_channel_nhwc(input, output, window) : run_dequantization_qsymm8_per_channel_nchw(input, output, window); - break; - case DataType::QSYMM8: - run_dequantization_qsymm8(input, output, window); - break; - case DataType::QSYMM16: - run_dequantization_qsymm16(input, output, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -} // namespace - -void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - - ICpuKernel::configure(win); -} - -Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(dst->info()->data_type()) - { - case DataType::F32: - run_dequantization_core(src, dst, window); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - run_dequantization_core(src, dst, window); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -const char *CpuDequantizeKernel::name() const -{ - return "CpuDequantizeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.h b/src/core/cpu/kernels/CpuDequantizeKernel.h deleted file mode 100644 index e80aa3aaad..0000000000 --- a/src/core/cpu/kernels/CpuDequantizeKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the dequantization layer kernel. */ -class CpuDequantizeKernel : public ICpuKernel -{ -public: - CpuDequantizeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel); - /** Set input, output tensors. - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuDequantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp deleted file mode 100644 index faff55e905..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp +++ /dev/null @@ -1,1385 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
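// Hedged usage sketch (illustrative only, not part of the original sources) of the
// stateless kernel interface above: tensor infos are validated/configured first, the
// actual tensors are supplied at run time through an ITensorPack. The shapes, the
// quantization parameters and the single-threaded run_op call are assumptions for the
// example; the internal header is referenced at its pre-move location.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/cpu/kernels/CpuDequantizeKernel.h"

int main()
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    cpu::kernels::CpuDequantizeKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDequantizeKernel::validate(src.info(), dst.info()));
    k.configure(src.info(), dst.info());

    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    k.run_op(pack, k.window(), ThreadInfo{});
    return 0;
}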
- */ -#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -using namespace arm_compute::detail; - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template -float16x8_t internal_vld1q(const float16_t *in); - -template <> -float16x8_t internal_vld1q<1>(const float16_t *in) -{ - return vld1q_f16(in); -} - -template <> -float16x8_t internal_vld1q<2>(const float16_t *in) -{ - const float16x8x2_t tmp = vld2q_f16(in); - return tmp.val[0]; -} - -template <> -float16x8_t internal_vld1q<3>(const float16_t *in) -{ - const float16x8x3_t tmp = vld3q_f16(in); - return tmp.val[0]; -} - -inline float16x8_t internal_vdupq_n(float16_t v) -{ - return vdupq_n_f16(v); -} - -inline void internal_vst1q(float16_t *p, const float16x8_t &v) -{ - vst1q_f16(p, v); -} - -float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) -{ - return vmulq_f16(x, y); -} - -inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) -{ - return vaddq_f16(x, vmulq_f16(y, z)); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -float32x4_t internal_vld1q(const float *in); - -template <> -float32x4_t internal_vld1q<1>(const float *in) -{ - return vld1q_f32(in); -} - -template <> -float32x4_t internal_vld1q<2>(const float *in) -{ - const float32x4x2_t tmp = vld2q_f32(in); - return tmp.val[0]; -} - -template <> -float32x4_t internal_vld1q<3>(const float *in) -{ - const float32x4x3_t tmp = vld3q_f32(in); - return tmp.val[0]; -} - -inline float32x4_t internal_vdupq_n(float v) -{ - return vdupq_n_f32(v); -} - -inline void internal_vst1q(float *p, const float32x4_t &v) -{ - vst1q_f32(p, v); -} - -float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y) -{ - return vmulq_f32(x, y); -} - -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) -{ - return vmlaq_f32(x, y, z); -} - -constexpr int small_tensor_size_optim = 8; -inline bool run_optim_small_tensor_info(const ITensorInfo *t) -{ - return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; -} - -inline bool run_optim_small_tensor(const ITensor *t) -{ - return run_optim_small_tensor_info(t->info()); -} - -// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 -// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to -// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer. 
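// Illustrative note (Arm NEON target assumed, not part of the original sources):
// the internal_vld1q<2> / internal_vld1q<3> specialisations above realise a strided
// load through de-interleaving loads, so the same inner loop can serve
// conv_stride_x = 1, 2 or 3. A standalone restatement of the idiom:
#include <arm_neon.h>

inline float32x4_t load_every_2nd_f32(const float *in)
{
    // vld2q_f32 splits 8 consecutive floats into even/odd lanes;
    // val[0] = { in[0], in[2], in[4], in[6] }, i.e. a stride-2 gather.
    const float32x4x2_t tmp = vld2q_f32(in);
    return tmp.val[0];
}

inline float32x4_t load_every_3rd_f32(const float *in)
{
    // vld3q_f32 de-interleaves 12 consecutive floats into three vectors;
    // val[0] = { in[0], in[3], in[6], in[9] }, i.e. a stride-3 gather.
    const float32x4x3_t tmp = vld3q_f32(in);
    return tmp.val[0];
}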
-template -class convolver_w1x1_i8x8_f32 -{ -public: - static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); - - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - std::array accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - std::array accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for(int oz = 0; oz < range_z; ++oz) - { - accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); - accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f); - auto p_out_base = out_ptr + oz * output_stride_z; - for(int p = 0; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk0 = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto v_in0 = internal_vld1q(in_val); - auto v_in1 = internal_vld1q(in_val + 4); - 
accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0); - accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1); - } - } - for(oh = 0; oh < output_h; ++oh) - { - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - vst1q_f32(p_out, accum0[oh]); - vst1q_f32(p_out + 4, accum1[oh]); - } - } - }, - in, out); - } -}; - -template -class convolver_1x1 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1> - */ - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < range_z; ++oz) - { - auto p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto k_val = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + (0 * input_stride_z + offset_xy)); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmull(vk, 
internal_vld1q(in_val))); - } - } - } - - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val))); - } - } - } - } - }, - in, out); - } -}; - -template -float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); - -inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) -{ - const float32x4x3_t m00 = - { - { - vld1q_dup_f32(m0), - vld1q_dup_f32(m1), - vld1q_dup_f32(m2) - } - }; - return m00; -} - -inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4) -{ - const float32x4x2_t m00 = - { - { - vld1q_dup_f32(m3), - vld1q_dup_f32(m4) - } - }; - return m00; -} - -inline float32x4x3_t load_input(const float *const in) -{ - const float32x4x3_t vin = - { - { - vld1q_f32(in), - vld1q_f32(in + 4), - vld1q_f32(in + 8) - } - }; - return vin; -} - -template <> -inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - const float32x4x3_t vin0 = load_input(in_0); - const float32x4x3_t vin1 = load_input(in_1); - const float32x4x3_t vin2 = load_input(in_2); - const float32x4x3_t vin3 = load_input(in_3); - const float32x4x3_t vin4 = load_input(in_4); - const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0); - const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0); - const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1); - const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1); - const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2); - const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2); - const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3); - const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3); - const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4); - const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4); - - float32x4x2_t out = - { - { - vmulq_f32(vin0.val[0], m00.val[0]), - vmulq_f32(vin0.val[1], m00.val[0]) - } - }; - - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], 
m11.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]); - - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 
1); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - return out; -} - -template -class convolver_3x3 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - /* - Each thread executing this kernel computes one or more output's volume planes. 
- - Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], - the third thread [16,24] and the fourth thread [25,31]. - - The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this - is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value. - - The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages: - 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. - 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1. - */ - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto vk_r0 = load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w; - const uint8_t *input_base = input_ptr + p * input_stride_z; - const auto ptr_k_r0 = reinterpret_cast(ptr_k_base); - const auto ptr_k_r1 = reinterpret_cast(ptr_k_base + kernel_stride_y); - const auto ptr_k_r2 = reinterpret_cast(ptr_k_base + kernel_stride_y * 2); - const auto vk_r0 = load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_base + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_base + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_base + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - } - }, - in, out); 
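// Compact scalar restatement (illustrative only, shown for a 1x1 kernel for brevity)
// of the two-stage scheme described in the comment above: kernel plane 0 initialises
// each output plane, the remaining kernel planes accumulate into it, so no
// intermediate buffer is needed. The 3x3 and 5x5 convolvers follow the same
// structure but replace the per-element product with a small 2-D stencil.
void convolve_1x1_two_stage(const float *in, const float *k, float *out,
                            int kernel_depth, int plane_elems)
{
    // Stage 1: initialise the output plane with the contribution of input plane 0.
    for(int i = 0; i < plane_elems; ++i)
    {
        out[i] = k[0] * in[i];
    }
    // Stage 2: accumulate the contributions of the remaining planes in-place.
    for(int p = 1; p < kernel_depth; ++p)
    {
        for(int i = 0; i < plane_elems; ++i)
        {
            out[i] += k[p] * in[p * plane_elems + i];
        }
    }
}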
- } -}; - -template -class convolver_5x5 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * 
kernel_stride_y + 0 * kernel_stride_x); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - store_results(p_out, vres); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); - - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - accumulate_results(p_out, vres); - } - } - } - } - }, - in, out); - } -}; - -float vreduce(const float32x4_t &v) -{ - auto v0 = wrapper::vgethigh(v); - auto v1 = wrapper::vgetlow(v); - auto v_out = wrapper::vadd(v0, v1); - - float a = wrapper::vgetlane(v_out, 0); - float b = wrapper::vgetlane(v_out, 1); - return a + b; -} - -template -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - 
convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template <> -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - if(run_optim_small_tensor(src)) - { - switch(conv_stride_x) - { - case 1: - convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); - break; - case 2: - convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); - break; - case 3: - convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } - else - { - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } -} - -template -inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template -inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == 
DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, - unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - - // Calculate right and bottom border - unsigned int kernel_size = weights->dimension(width_idx); - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - const int input_width = src->dimension(width_idx); - - Window win{}; - bool window_changed = false; - - if(data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_elems_written_per_iteration = 8; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - if(run_optim_small_tensor_info(src)) - { - num_elems_written_per_iteration = 8; - } - else - { - num_elems_written_per_iteration = 4; - } - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - num_weight_elems_read_per_row = kernel_size; - num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; - break; - } - case 3: - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_weight_elems_read_per_row = 8 + kernel_size - 1; - num_elems_read_per_iteration = 24; - num_elems_written_per_iteration = 32 >> conv_stride_x; - break; -#endif 
/* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - break; - case 5: - { - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - } - break; - default: - { - ARM_COMPUTE_ERROR("Not implemented"); - break; - } - } - - // Calculate right pad - int start_x = kernel_size / 2 - static_cast(conv_info.pad_left()); - int end_x = ceil_to_multiple(static_cast(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; - int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; - - // Calculate border - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = std::max(upper_bound_w, 0); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - - border_size.left = conv_pad_left; - border_size.top = conv_pad_top; - border_size.right = conv_pad_right; - border_size.bottom = conv_pad_bottom; - - // Configure window - win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, - num_elems_read_per_iteration, kernel_size, - conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); - AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - } - else - { - // Configure window NHWC without any padding - win = calculate_max_window(*dst, Steps()); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) -{ - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} - -} // namespace - -template -void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // This function assumes that input and weights have not padding in channel - - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - /* - * This implementation parallelize the full WC plane of input and weights by - * treating them as series of elements. So for example, a 3x3 weights and - * floating point vector operations of 4 elements per time, the first 3 - * channel elements of the first row would be taken and additionally the first - * element of the second row. The 9 elements in each single WC weight plane - * would require 2 4-element vector operations and a last single element operation. - * - * This works since when we create the input vector to multiply with the weights, - * the exact required elements are loaded in the same order. Therefore the - * multiplication works on the correct input/weight elements. - */ - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - * In here we create theoretical indexes which then we validate for both - * inputs and weights. - * As a reminder, this loop take each output point in NHW, C is treated - * in the weights loop. 
- */ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* - * This is the loop in the weights, and it goes along N (the batches) - * As a reminder, the batches of the weights are translated into the - * channels of the output - */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -template -void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - 
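// Illustrative helper (hypothetical, not part of the original sources) summarising the
// border-clamping index arithmetic used by convolve_nhwc_optimized above and repeated
// in convolve_nhwc below: the theoretical input window implied by an output coordinate
// is clamped to the input extent, and the clamped amount selects which weight
// rows/columns still participate, so no out-of-bounds element is ever read.
#include <algorithm>

struct ConvRange
{
    int in_start;  // first valid input coordinate
    int in_end;    // one past the last valid input coordinate
    int wei_start; // first weight element that still overlaps the input
    int wei_end;   // one past the last weight element that still overlaps the input
};

inline ConvRange make_conv_range(int out_coord, int stride, int pad, int kernel_dim, int input_dim)
{
    const int in_start_t = out_coord * stride - pad; // theoretical start, may be negative
    const int in_end_t   = in_start_t + kernel_dim;  // theoretical end, may exceed input_dim
    const int in_start   = std::max(in_start_t, 0);
    const int in_end     = std::min(in_end_t, input_dim);
    return { in_start, in_end, in_start - in_start_t, kernel_dim - (in_end_t - in_end) };
}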
const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) - { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val 
= *(weights_ptr_mover); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -BorderSize CpuDirectConv2dKernel::border_size() const -{ - return _border_size; -} - -void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - _conv_info = conv_info; - _data_layout = src->data_layout(); - _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = conv_info.pad_right(); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - if(_data_layout == DataLayout::NCHW) - { - _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); - } - else - { - _border_size = BorderSize(0); - } - - // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, data_type); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, - _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - unsigned int num_weight_elems_read_per_row = 0; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_written_per_iteration = 0; - BorderSize border_size = {}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - weights->clone().get(), - dst->clone().get(), - conv_info, - num_weight_elems_read_per_row, - num_elems_read_per_iteration, - num_elems_written_per_iteration, - border_size) - .first); - - return Status{}; -} - -void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - if(_data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - 
default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 3: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 5: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_5x5(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - default: - { - ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported."); - break; - } - } - } - else - { - switch(src->info()->data_type()) - { - case DataType::F32: - { - if(have_zero_x_internal_padding(src->info(), weights->info())) - { - convolve_nhwc_optimized(window, src, weights, dst); - } - else - { - convolve_nhwc(window, src, weights, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } -} -const char *CpuDirectConv2dKernel::name() const -{ - return "CpuDirectConvolutionLayerKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h deleted file mode 100644 index 9bef1c484a..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform Direct Convolution Layer. */ -class CpuDirectConv2dKernel : public ICpuKernel -{ -public: - CpuDirectConv2dKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); - /** Set the src, weights, and dst tensors. 
- * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * - * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported:Same as @p input. - * @param[out] dst Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - BorderSize border_size() const override; - -private: - /* Template function for optimized convolution NHWC */ - template - void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - /* Template function for convolution NHWC */ - template - void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - PadStrideInfo _conv_info{}; - BorderSize _border_size{}; - unsigned int _kernel_size{ 0 }; - unsigned int _num_weight_elems_read_per_row{ 0 }; - unsigned int _num_elems_read_per_iteration{ 0 }; - unsigned int _num_elems_written_per_iteration{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp deleted file mode 100644 index 662d052941..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - } - - if(src->data_type() == DataType::S32) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); - } - - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - if(is_data_type_float(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - else if(src->data_type() == DataType::S32) - { - // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); - } - - return Status{}; -} - -template -typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - /** SIMD vector tag type. 
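Reviewer note on the output-stage implementations that follow: bias indexing differs by layout. In the NCHW variants the inner x-loop walks the width, so a single scalar bias per output channel (taken from the z coordinate) is broadcast across the whole row; in the NHWC variants the x-loop walks the channels, so a contiguous slice of the bias vector is loaded and added lane by lane. An illustrative plain-C++ sketch with hypothetical names, not the kernel's actual code:

#include <cstddef>

// NCHW: one scalar bias for the current channel, broadcast over the row.
void add_bias_nchw_row(float *row, size_t width, float bias_for_this_channel)
{
    for(size_t x = 0; x < width; ++x) { row[x] += bias_for_this_channel; }
}

// NHWC: per-channel biases added element-wise across the channel dimension.
void add_bias_nhwc_pixel(float *pixel, const float *bias, size_t channels)
{
    for(size_t c = 0; c < channels; ++c) { pixel[c] += bias[c]; }
}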
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - *(reinterpret_cast(out.ptr()) + x) = s_in; - } - - }, - in, out); -} - -template -typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); -} - -// Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) - } - }; - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); -} -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - 
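Reviewer note: the quantized paths above and below funnel the accumulated int32 values through finalize_quantization (from NEAsymm.h), i.e. multiply by a fixed-point multiplier, shift right with rounding, add the output offset, then clamp to the output type's range. A rough scalar sketch of that arithmetic, assuming a gemmlowp-style Q0.31 multiplier, a non-negative shift, and ignoring the exact saturation corner cases of the NEON helpers:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Rough scalar equivalent of the int32 -> quantized output stage.
int32_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t offset,
                   int32_t qmin, int32_t qmax)
{
    // Fixed-point multiply: multiplier is a Q0.31 value in [0.5, 1).
    const int64_t prod    = static_cast<int64_t>(acc) * multiplier;
    const int32_t mulhigh = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);

    // Rounding arithmetic shift right by 'shift' (assumed >= 0 here).
    const int32_t rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
    const int32_t scaled   = (mulhigh + rounding) >> shift;

    // Add the zero-point offset and clamp to the output type's range.
    return std::min(qmax, std::max(qmin, scaled + offset));
}

int main()
{
    // e.g. accumulator 1000, effective scale ~0.7 / 2^3, offset 10, uint8 range
    std::cout << requantize(1000, int32_t(0.7 * (int64_t(1) << 31)), 3, 10, 0, 255)
              << "\n"; // prints 98
}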
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); -} -} // namespace - -void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - _func = nullptr; - _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; - _result_shift = info.result_shift; - _result_offset_after_shift = info.result_offset_after_shift; - - // Auto-initialize output output if required - if(dst != nullptr) - { - // Work out expected output data type - const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); - } - - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; - - // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nchw; - } - else - { - _func = &output_stage_nchw; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nchw; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nchw; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } - else - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nhwc; - } - else - { - _func = &output_stage_nhwc; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nhwc; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nhwc; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } -} - -Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - return Status{}; -} - -void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); -} - -const char *CpuDirectConv2dOutputStageKernel::name() const -{ - return "CpuDirectConv2dOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h deleted file mode 100644 index 749411c0a7..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. - * - * @note We assume bias to be shared - * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part - * of the @ref DirectConvolutionLayerOutputStageKernelInfo. - */ -class CpuDirectConv2dOutputStageKernel : public ICpuKernel -{ -public: - CpuDirectConv2dOutputStageKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel); - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src - * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. - * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2dOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); - - OutputStageKernel *_func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp deleted file mode 100644 index dc574fce65..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
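Reviewer note: the output-stage kernel above is a good example of the configure-time selection / run-time dispatch idiom used throughout this patch: configure() picks a specialised free function once, stores it in the _func member, and run_op() calls through that pointer with no per-element branching. A trimmed-down sketch of the idiom with hypothetical types and names:

#include <cstdio>

enum class Layout { NCHW, NHWC };

using StageFn = void(const float *in, float *out, int n);

void stage_nchw(const float *in, float *out, int n) { for(int i = 0; i < n; ++i) { out[i] = in[i] + 1.f; } }
void stage_nhwc(const float *in, float *out, int n) { for(int i = 0; i < n; ++i) { out[i] = in[i] + 2.f; } }

struct MiniOutputStage
{
    void configure(Layout layout)
    {
        _func = (layout == Layout::NCHW) ? &stage_nchw : &stage_nhwc; // decided once
    }
    void run(const float *in, float *out, int n) const
    {
        (*_func)(in, out, n); // no layout/data-type branching at run time
    }
    StageFn *_func = nullptr;
};

int main()
{
    MiniOutputStage k;
    k.configure(Layout::NHWC);
    float in[3] = { 1.f, 2.f, 3.f }, out[3];
    k.run(in, out, 3);
    std::printf("%g %g %g\n", out[0], out[1], out[2]); // 3 4 5
}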
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuElementwiseKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ElementwiseSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using ElementwiseSelector = std::add_pointer::type; -using UKernelType = CpuElementwiseKernel::ElementwiseFunction; -struct ElementwiseKernel -{ - const char *name; - const ElementwiseSelector is_selected; - UKernelType *ukernel; -}; - -template -CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src1, dst); - static ElementwiseKernel kernels[] = - { -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, - { - "sve_s32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, - { - "sve_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, - { - "neon_s32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_elementwise", - [](const 
ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) - }, - { - "sve2_qs8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized)) - }, - { - "neon_qs8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ - }; - - for(const auto &uk : kernels) - { - if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) - { - return { uk.name, uk.ukernel }; - } - } - - return { "", nullptr }; -} - -template -CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src1, dst); - static ElementwiseKernel kernels[] = - { -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_u8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_fp32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_s16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_s32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_u8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8)) - }, - { - "neon_fp32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - 
REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32)) - }, - { - "neon_s16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16)) - }, - { - "neon_s32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) - }, - { - "sve2_qs8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized)) - }, - { - "neon_qs8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - }; - - for(const auto &uk : kernels) - { - if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) - { - return { uk.name, uk.ukernel }; - } - } - - return { "", nullptr }; -} -} // namespace - -Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - const auto uk = get_implementation(src0, src1, dst); - - _run_method = uk.ukernel; - _name = std::string("CpuElementwiseKernel").append("/").append(uk.name); - - // If any of shapes is dynamic, expect a configured window and 
dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) - { - return; - } - - auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape()); - auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type()); - ICpuKernel::configure(shape_and_window.second); -} - -void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, window); -} - -const char *CpuElementwiseKernel::name() const -{ - return _name.c_str(); -} - -/** Arithmetic operators (min, max, squared_diff) */ -void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = op; - configure_common(src0, src1, dst); -} - -Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - } - return validate_arguments_common(src0, src1, dst); -} - -Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -CpuElementwiseKernel::UKernelInfo CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - switch(_op) - { - case ArithmeticOperation::MAX: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::MIN: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::SQUARED_DIFF: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::PRELU: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::DIV: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::POWER: - return configure_arithm_func(src0, src1, dst); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return { "", nullptr }; -} - -/** The division operator */ - -void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = ArithmeticOperation::DIV; - configure_common(src0, src1, dst); -} - -Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32); - return CpuArithmeticKernel::validate_arguments(src0, src1, dst); -} - -Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -/** The power 
operator */ -void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = ArithmeticOperation::POWER; - configure_common(src0, src1, dst); -} - -Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32); - return CpuArithmeticKernel::validate_arguments(src0, src1, dst); -} - -Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ -void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = op; - configure_common(src0, src1, dst); -} - -Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); - } - return validate_arguments_common(src0, src1, dst); -} - -Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -CpuElementwiseKernel::UKernelInfo CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - switch(_op) - { - case ComparisonOperation::Equal: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::NotEqual: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::Greater: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::GreaterEqual: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::Less: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::LessEqual: - return configure_comp_func(src0, src1, dst); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return { "", nullptr }; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h deleted file mode 100644 index 75137da65d..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseKernel.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
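Reviewer note: the elementwise kernels above select their micro kernel from a static table of { name, selector predicate, ukernel } entries, keyed on data type and CPU features, where the first matching entry wins. A self-contained sketch of that registry pattern with hypothetical, trimmed-down types:

#include <functional>
#include <iostream>
#include <vector>

enum class DataType { F32, S32 };
struct SelectorData { DataType dt; bool has_sve; };

struct KernelEntry
{
    const char *name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

void sve_fp32_impl()  { std::cout << "running SVE F32 ukernel\n"; }
void neon_fp32_impl() { std::cout << "running NEON F32 ukernel\n"; }

const KernelEntry *select(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for(const auto &entry : table)
    {
        if(entry.is_selected(data)) { return &entry; } // order encodes priority
    }
    return nullptr;
}

int main()
{
    const std::vector<KernelEntry> table =
    {
        { "sve_fp32",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, sve_fp32_impl  },
        { "neon_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; },              neon_fp32_impl },
    };
    if(const auto *uk = select(table, { DataType::F32, /*has_sve=*/false }))
    {
        std::cout << "selected: " << uk->name << "\n";
        uk->ukernel();
    }
}

As in the tables above, ordering matters: SVE entries are listed first so they win when the CPU reports the feature, otherwise the NEON fallback is picked.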
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H -#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for an element-wise operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f] - * - */ -class CpuElementwiseKernel : public ICpuKernel -{ -public: - CpuElementwiseKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); - - using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); - struct UKernelInfo - { - std::string name; - std::function ukernel; - }; - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -protected: - /** Validate the argument passed to the kernel - * - * @param[in] src0 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input. Data types supported: Same as @p src0. - * @param[in] dst Output tensor. Data types supported: Dependent on subclass. - */ - static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - - /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff) - * - */ - void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - virtual UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0; - -protected: - std::function _run_method{ nullptr }; - std::string _name{}; -}; - -class CpuArithmeticKernel : public CpuElementwiseKernel -{ -public: - CpuArithmeticKernel() = default; - - /** Configure kernel - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. 
Data types supported: Same as @p src0. - */ - void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - - ArithmeticOperation _op{}; - -private: - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; -}; - -class CpuDivisionKernel : public CpuArithmeticKernel -{ -public: - CpuDivisionKernel() = default; - - /** Configure kernel - * - * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: Same as @p src0. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDivisionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); -}; - -class CpuPowerKernel : public CpuArithmeticKernel -{ -public: - CpuPowerKernel() = default; - - /** Configure kernel - * - * @param[in] src0 First tensor input info. Data types supported: F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: Same as @p src0. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPowerKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); -}; - -class CpuComparisonKernel : public CpuElementwiseKernel -{ -public: - CpuComparisonKernel() = default; - - /** Configure kernel - * - * @param[in] op Comparison operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: U8. 
- */ - void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuComparisonKernel::configure() - * - * @return a status - */ - static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - -private: - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; - - ComparisonOperation _op{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp deleted file mode 100644 index b03c32f023..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
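Reviewer note: a hedged usage sketch of the arithmetic-kernel interface declared above, following the usual validate-then-configure protocol. This is illustrative only; it includes the pre-move internal header path that this patch deletes (after the move the class lives under the merged backend folder), and assumes the caller builds against the library's internal sources:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/cpu/kernels/CpuElementwiseKernel.h"

using namespace arm_compute;

void example_configure_max()
{
    TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo out; // left empty: configure() auto-initialises it

    // Validate first, then configure on the same infos.
    const Status st = cpu::kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, &a, &b, &out);
    if(st.error_code() == ErrorCode::OK)
    {
        cpu::kernels::CpuArithmeticKernel k;
        k.configure(ArithmeticOperation::MAX, &a, &b, &out);
    }
}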
- */ -#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ElementwiseUnarySelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using ElementwiseUnarySelector = std::add_pointer::type; - -struct ElementwiseUnaryKernel -{ - const char *name; - const ElementwiseUnarySelector is_selected; - CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel; -}; - -static const ElementwiseUnaryKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op), - }, - { - "sve_fp16_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>), - }, - { - "sve_s32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op), - }, -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op), - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>), - }, -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_s32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op), - }, -#endif // defined(ARM_COMPUTE_ENABLE_NEON) -}; - -const ElementwiseUnaryKernel *get_implementation(DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} -} // namespace - -void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); - const auto uk = get_implementation(src.data_type()); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _op = op; - _run_method = uk->ukernel; - _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); - - // If input shape is dynamic, expect a configured window and dst at run-time. 
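Reviewer note: configure() above relies on the "total_size() == 0 means not yet configured" convention that recurs throughout these kernels, only filling in dst when nothing has been set on it. A hypothetical stand-in for ITensorInfo and auto_init_if_empty, just to illustrate that contract:

#include <cstddef>
#include <iostream>

struct MiniInfo
{
    size_t total_size = 0;   // 0 means "not yet configured"
    int    width = 0, height = 0;
};

void auto_init_if_empty(MiniInfo &dst, int w, int h)
{
    if(dst.total_size == 0) // never overwrite a caller-provided output
    {
        dst.width      = w;
        dst.height     = h;
        dst.total_size = static_cast<size_t>(w) * h;
    }
}

int main()
{
    MiniInfo empty, preset{ 64, 8, 8 };
    auto_init_if_empty(empty, 16, 4);  // initialised to 16x4
    auto_init_if_empty(preset, 16, 4); // left untouched at 8x8
    std::cout << empty.width << "x" << empty.height << ", "
              << preset.width << "x" << preset.height << "\n";
}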
- if(src.is_dynamic()) - { - return; - } - - auto shape_and_window = compute_output_shape_and_window(src.tensor_shape()); - auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type()); - ICpuKernel::configure(shape_and_window.second); -} - -Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - - const auto *uk = get_implementation(src.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - switch(op) - { - case ElementWiseUnary::EXP: - case ElementWiseUnary::RSQRT: - case ElementWiseUnary::LOG: - case ElementWiseUnary::ROUND: - case ElementWiseUnary::SIN: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); - break; - case ElementWiseUnary::NEG: - case ElementWiseUnary::ABS: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); - break; - default: - ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); - } - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - - return Status{}; -} - -void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window, _op); -} - -const char *CpuElementwiseUnaryKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h deleted file mode 100644 index bda65a35e0..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
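Reviewer note: run_op() above pulls its operands out of an ITensorPack instead of storing tensor pointers on the kernel, which is what keeps these kernels stateless and reusable. A caller-side sketch of how such a pack is assembled, using real ACL types at this revision but a hypothetical calling context:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// 'src' and 'dst' are ITensor implementations owned by the caller.
void pack_operands(ITensor *src, ITensor *dst)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, src); // read-only operand
    pack.add_tensor(TensorType::ACL_DST, dst);       // writable result
    // The pack is then handed to the scheduler together with the kernel's
    // window, and run_op() retrieves the tensors back by the same ids.
}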
- */ -#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H -#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for an element-wise unary operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x) = OP(src(x))@f] - */ -class CpuElementwiseUnaryKernel : public ICpuKernel -{ -public: - CpuElementwiseUnaryKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel); - - /** Function to configure the @ref CpuElementwiseUnaryKernel - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] src First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. - * @param[out] dst Output tensor. Data types supported: Same as @p src. - */ - void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuElementwiseUnaryKernel::configure() - * - * @return a status - */ - static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - /** Common signature for all the specialised elementwise unary micro-kernels - * - * @param[in] window Region on which to execute the kernel. - */ - using ElementwiseUnaryUkernelPtr = std::add_pointer::type; - -private: - ElementWiseUnary _op{}; - ElementwiseUnaryUkernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuFillKernel.cpp b/src/core/cpu/kernels/CpuFillKernel.cpp deleted file mode 100644 index aab4d715ee..0000000000 --- a/src/core/cpu/kernels/CpuFillKernel.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuFillKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - _constant_value = constant_value; - - // Configure kernel window - Window win = calculate_max_window(*tensor, Steps()); - ICpuKernel::configure(win); -} - -void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST); - - // Collapse all the batches on the third dimension - bool has_collapsed = true; - Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed); - ARM_COMPUTE_ERROR_ON(!has_collapsed); - - uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor); - const auto window_width = static_cast(collapsed.x().end()) - static_cast(collapsed.x().start()); - const size_t element_size = inout->info()->element_size(); - - // Unroll X dimension - collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator tensor_it(inout, collapsed); - execute_window_loop(collapsed, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + tensor_it.offset(); - // Set memory - for(int i = 0; i < window_width; ++i) - { - std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); - } - - }, - tensor_it); -} - -const char *CpuFillKernel::name() const -{ - return "CpuFillKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuFillKernel.h b/src/core/cpu/kernels/CpuFillKernel.h deleted file mode 100644 index 9afdee4186..0000000000 --- a/src/core/cpu/kernels/CpuFillKernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H -#define ARM_COMPUTE_CPU_FILL_KERNEL_H - -#include "arm_compute/core/PixelValue.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel for filling a tensor with a given constant value */ -class CpuFillKernel : public ICpuKernel -{ -public: - CpuFillKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel); - /** Configure kernel for a given list of arguments - * - * @param[in,out] tensor Tensor to fill. Supported data types: All - * @param[in] constant_value The value used to fill the planes of the tensor - */ - void configure(const ITensorInfo *tensor, const PixelValue &constant_value); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PixelValue _constant_value{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp deleted file mode 100644 index d41df6a1f5..0000000000 --- a/src/core/cpu/kernels/CpuFloorKernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuFloorKernel.h" - -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/floor/list.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct FloorSelectorData -{ - DataType dt; -}; - -using FloorSelectorPtr = std::add_pointer::type; -using FloorUKernelPtr = std::add_pointer::type; - -struct FloorUKernel -{ - const char *name; - const FloorSelectorPtr is_selected; - FloorUKernelPtr ukernel; -}; - -static const FloorUKernel available_kernels[] = -{ - { - "neon_fp16_floor", - [](const FloorSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) - }, - { - "neon_fp32_floor", - [](const FloorSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) - }, -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const FloorUKernel *get_implementation(const FloorSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - - const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - // Validate in case of configured output - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - - const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuFloorKernel").append("/").append(uk->name); - - // Configure kernel window - const Window win = calculate_max_window(*src, Steps()); - - ICPPKernel::configure(win); -} - -Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(dst); - ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst))); - - Window win; - win.use_tensor_dimensions(src->tensor_shape()); - return win; -} - -Status CpuFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - return Status{}; -} - -void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = 
tensors.get_tensor(TensorType::ACL_DST); - const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - _run_method(src_it.ptr(), dst_it.ptr(), len); - }, - src_it, dst_it); -} - -const char *CpuFloorKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h deleted file mode 100644 index 78534d2a1d..0000000000 --- a/src/core/cpu/kernels/CpuFloorKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H -#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Cpu accelarated kernel to perform a floor operation */ -class CpuFloorKernel : public ICpuKernel -{ -public: - CpuFloorKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor. Data type supported: F16/F32. - * @param[out] dst Destination tensor. Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuFloorKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - /** Infer execution window - * - * @param[in] src Source tensor info. Data type supported: F16/F32. - * @param[in] dst Destination tensor info. 
Same as @p src - * - * @return an execution Window - */ - Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using FloorUKernelPtr = std::add_pointer::type; - -private: - FloorUKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp deleted file mode 100644 index a6b080c0ab..0000000000 --- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_interleaved_shape(*src))); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst)); - - Window win = calculate_max_window(*src, Steps(1, 4)); - ICPPKernel::configure(win); -} - -Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
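// Note (illustrative, not part of the deleted file): worked example of the interleaved shape this
// validate() step compares against, derived from the run_op() implementation below (the dst Y
// window is scaled by 0.25 and, for each column x of four consecutive src rows, the four values
// are packed into dst offsets 4*x .. 4*x+3):
//
//   src: dim0 x dim1 = 3 x 5              dst: dim0 x dim1 = 12 x 2
//   row group 0 (src rows 0..3) -> dst row 0: a00 a10 a20 a30  a01 a11 a21 a31  a02 a12 a22 a32
//   row group 1 (src row 4)     -> dst row 1: a40 0 0 0        a41 0 0 0        a42 0 0 0
//
// i.e. dst dimension 0 is src dimension 0 * 4 and dst dimension 1 is ceil(src dimension 1 / 4),
// with the missing rows of a partial 4-row group zero-filled (the std::memset branch in run_op()).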
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->total_size() != 0) - { - const TensorShape dst_shape = compute_interleaved_shape(*src); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - /* - * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - * |a00 a01 a02 a03| - * |a10 a11 a12 a13| - * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | - * |a30 a31 a32 a33| - * - * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] - */ - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - const size_t window_start_x = window.x().start(); - const size_t window_end_x = window.x().end(); - - const size_t in_height = src->info()->dimension(1); - const size_t in_stride = src->info()->strides_in_bytes()[1]; - - const size_t partial_y = in_height % 4; - - const size_t element_size = src->info()->element_size(); - - // Set window for the src tensor - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Set window for the dst tensor - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.scale(Window::DimY, 0.25f); - - Iterator in(src, win); - Iterator out(dst, win_out); - - execute_window_loop(win, [&](const Coordinates & id) - { - if(id.y() + 4 <= static_cast(in_height)) - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); - } - } - else - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - size_t y = 0; - for(; y < partial_y; ++y) - { - std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); - } - for(; y < 4; ++y) - { - std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); - } - } - } - }, - in, out); -} - -const char *CpuGemmInterleave4x4Kernel::name() const -{ - return "CpuGemmInterleave4x4Kernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h deleted file mode 100644 index 0c55886d8d..0000000000 --- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to interleave the elements of a matrix - * - * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccccccccccc} - * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ - * \end{array} \right) - * @f] - * - * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] - */ -class CpuGemmInterleave4x4Kernel : public ICpuKernel -{ -public: - CpuGemmInterleave4x4Kernel() = default; - /** Initialise the kernel's src and dst. - * - * @param[in] src Input tensor info. Data types supported: All - * @param[out] dst Output tensor info which stores the interleaved matrix. Data type supported: same as @p src. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmInterleave4x4Kernel - * - * Similar to @ref CpuGemmInterleave4x4Kernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp deleted file mode 100644 index 35e542faa4..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ /dev/null @@ -1,1053 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) -{ - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const uint8x8_t a00_u8 = vld1_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); - const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); - const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); - const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); - const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); - const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); - const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); - const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4x2_t a00_u16 = - { - { - vget_low_u16(vmovl_u8(a00_u8)), - vget_high_u16(vmovl_u8(a00_u8)) - } - }; - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - const uint16x4x4_t b10_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), - 
vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) - } - }; - - const uint16x4x4_t b20_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) - } - }; - - const uint16x4x4_t b30_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) - } - }; - - const uint16x4x4_t b40_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) - } - }; - - const uint16x4x4_t b50_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) - } - }; - - const uint16x4x4_t b60_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) - } - }; - - const uint16x4x4_t b70_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); - 
c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } - - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vec_out + k * 4 + j) = c0.val[k][j]; - } - } - } - }, - ina, inb, out); -} - -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) -{ - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Accumulators for the block 0 - int32x4x4_t c0 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const int8x8_t a00_s8 = vld1_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); - const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); - const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); - const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); - const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); - const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); - const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); - const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); - - // Convert a00_s8 to int16_t and get the lower part - const int16x4x2_t a00_s16 = - { - { - vget_low_s16(vmovl_s8(a00_s8)), - vget_high_s16(vmovl_s8(a00_s8)) - } - }; - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - 
vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - const int16x4x4_t b10_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) - } - }; - - const int16x4x4_t b20_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) - } - }; - - const int16x4x4_t b30_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) - } - }; - - const int16x4x4_t b40_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) - } - }; - - const int16x4x4_t b50_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) - } - }; - - const int16x4x4_t b60_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) - } - }; - - const int16x4x4_t b70_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], 
b50_s16.val[1], a00_s16.val[1], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } - - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const int8x8_t a00_s8 = vld1_dup_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b); - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vec_out + k * 4 + j) = c0.val[k][j]; - } - } - } - }, - ina, inb, out); -} - -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) -{ - const auto width_out = static_cast(out_info.dimension(0)); - const auto height_out = static_cast(out_info.dimension(1)); - const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *mtx_a0 = ina.ptr(); - const uint8_t *mtx_b0 = inb.ptr(); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 1 - uint32x4x4_t c1 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 2 - uint32x4x4_t c2 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 3 - uint32x4x4_t c3 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const uint8x8_t a00_u8 = vld1_u8(mtx_a0); - const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - - // Convert 
a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Convert b00_s8 to uint16_t - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); - c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); - c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); - c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); - c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); - c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); - c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); - c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); - c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); - c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); - } - - auto mtx_out = reinterpret_cast(out.ptr()); - - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - if(id.y() + 3 < height_out) - { - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); - } - } - } - } - else - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; - } - } - if(id.y() + 2 < height_out) - { - 
left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; - } - } - if(id.y() + 3 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; - } - } - } - } - } - } - }, - ina, inb, out); -} - -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) -{ - const auto width_out = static_cast(out_info.dimension(0)); - const auto height_out = static_cast(out_info.dimension(1)); - const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration - // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - int32x4x4_t c0 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 1 - int32x4x4_t c1 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 2 - int32x4x4_t c2 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 3 - int32x4x4_t c3 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], 
b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); - } - auto mtx_out = reinterpret_cast(out.ptr()); - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - if(id.y() + 3 < height_out) - { - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); - } - } - } - } - else if(id.y() < height_out) - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; - } - } - if(id.y() + 2 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; - } - } - if(id.y() + 3 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; - } - } - } - } - } - } - - }, - ina, inb, out); -} - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - - TensorShape in0_shape = src0->tensor_shape(); - TensorShape in1_shape = src1->tensor_shape(); - TensorShape out_shape = dst->tensor_shape(); - - // Check vector-by-matrix case - if(out_shape[1] == 1) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); - } - else - { - in0_shape.collapse(2); - in1_shape.collapse(2); - out_shape.collapse(2); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the 
same number of batches of input0 or the number of batches must be set to 1"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); - } - - return Status{}; -} -} // namespace - -void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src0); - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst)); - - TensorShape in1_shape = src1->tensor_shape(); - in1_shape.collapse(2); - - _slide_matrix_b = in1_shape[2] != 1; - - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win; - // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication - if((dst->dimension(1) == 1)) - { - // Configure kernel window - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); - } - else - { - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - } - - ICpuKernel::configure(win); -} - -Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); - return Status{}; -} - -void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication path - if((dst->info()->dimension(1) == 1)) - { - const auto width_matrix_a = static_cast(src0->info()->dimension(0)); - const auto width_matrix_b = static_cast(src1->info()->dimension(0)); - const auto width_out = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(src1->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(src0, win_a); - Iterator inb(src1, win_b); - Iterator out(dst, win_out); - - switch(src0->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8_SIGNED: - { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); - break; - } - case DataType::U8: - case DataType::QASYMM8: - { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - break; - } - } - } - else - { - const size_t in_b_stride = src1->info()->strides_in_bytes()[1]; - const int width_b = src1->info()->dimension(0); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); - - // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // The step x and step y for the output matrix has been already set using in configure() - Iterator ina(src0, win_a); - Iterator inb(src1, win_b); - Iterator out(dst, window); - - switch(src0->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8_SIGNED: - { - matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window); - break; - } - case DataType::U8: - case DataType::QASYMM8: - { - matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - break; - } - } - } -} - -const char *CpuGemmLowpMatrixMultiplyKernel::name() const -{ - return "CpuGemmLowpMatrixMultiplyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h deleted file mode 100644 index 77d8741b19..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to multiply matrices - * - * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixMultiplyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel); - /** Initialise the kernel's input and output. - * - * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two - * kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED - * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32 - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - bool _slide_matrix_b{ true }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/ diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp deleted file mode 100644 index 270abc8bbd..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info)); - _k = info.k; - _scalar = info.scalar; - _mul_by_scalar = info.mul_by_scalar; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &CpuGemmLowpMatrixAReductionKernel::run_internal; - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - _func = &CpuGemmLowpMatrixAReductionKernel::run_internal; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32); - - Window win = calculate_max_window(*dst, Steps(1)); - ICpuKernel::configure(win); -} - -Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); - return Status{}; -} - -template -void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) -{ - // Intermediate 
and final accumulator types - using TIAcc = wrapper::traits::promote_t; - using TAcc = wrapper::traits::promote_t; - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window win_input(collapsed_window); - win_input.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator in(src, win_input); - Iterator out(dst, collapsed_window); - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - TAcc sum_row = 0; - - const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); -#endif /* __arm__ */ - - int i = 0; - // This for loop performs 16 accumulations - for(; i <= (_k - 16); i += 16) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i); - - // Partial accumulations in U16 - const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); - - // Accumulate to U32 - vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); - } - - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - sum_row += static_cast(matrix_a[i]); - } - -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - sum_row += wrapper::vaddv(vsum_row); -#else // __aarch64__ - auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); - tmp = wrapper::vpadd(tmp, tmp); - - sum_row += wrapper::vgetlane(tmp, 0); -#endif // __aarch64__ - - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row *= _scalar; - } - - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); - }, - in, out); -} - -void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, dst, window); -} - -const char *CpuGemmLowpMatrixAReductionKernel::name() const -{ - return "CpuGemmLowpMatrixAReductionKernel"; -} - -void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); - - _k = info.k; - _scalar = info.scalar; - _mul_by_scalar = info.mul_by_scalar; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &CpuGemmLowpMatrixBReductionKernel::run_internal; - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - _func = &CpuGemmLowpMatrixBReductionKernel::run_internal; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32); - - // Configure kernel window - Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration)); - ICpuKernel::configure(win); -} - 
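// CpuGemmLowpMatrixAReductionKernel produces per-row sums of matrix A and
// CpuGemmLowpMatrixBReductionKernel produces per-column sums of matrix B; both feed the
// later offset-contribution stage. The following is a minimal scalar sketch of what the
// B-reduction computes. The dense row-major int8_t layout and the helper name are
// assumptions made only for illustration and are not part of the original kernel.
static inline void reduce_matrix_b_columns_sketch(const int8_t *matrix_b, int32_t *sums,
                                                  int k, int width_b, int row_stride,
                                                  bool mul_by_scalar, int32_t scalar)
{
    for(int x = 0; x < width_b; ++x)
    {
        int32_t sum_col = 0;
        for(int row = 0; row < k; ++row)
        {
            sum_col += static_cast<int32_t>(matrix_b[row * row_stride + x]);
        }
        sums[x] = mul_by_scalar ? sum_col * scalar : sum_col; // one S32 sum per column of B
    }
}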
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); - return Status{}; -} - -template -void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) -{ - // Intermediate and final accumulator types - using TIAcc = wrapper::traits::promote_t; - using TAcc = wrapper::traits::promote_t; - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); - const auto vec_scalar = wrapper::vdup_n(static_cast(_scalar), wrapper::traits::vector_128_tag{}); - - const auto width_matrix_b = static_cast(src->info()->dimension(0)); - const auto in_b_stride = static_cast(src->info()->strides_in_bytes()[1]); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(collapsed_window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - - Window win_in(win_out); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator inb(src, win_in); - Iterator out(dst, win_out); - - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) - { - return; - } - - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); -#endif /* __arm__ */ - - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); - -#if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); -#endif /* __arm__ */ - - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], 
wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector::type b0_b16[2] - { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - - matrix_b += in_b_stride; - } - - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } - - auto vector_sum_col = reinterpret_cast(out.ptr()); - if(id.x() + 16 < width_matrix_b) - { - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - } - else - { - auto left_over = width_matrix_b - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vector_sum_col + k * 4 + j) = sum_col[k][j]; - } - } - } - }, - inb, out); -} - -void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, dst, window, info); -} - -const char *CpuGemmLowpMatrixBReductionKernel::name() const -{ - return "CpuGemmLowpMatrixBReductionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h deleted file mode 100644 index 106980fc0b..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -struct GEMMLowpReductionKernelInfo; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixAReductionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_a_cols) Number of matrix A columns - * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced column must be multiplied by a scalar value. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixAReductionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] src Input tensor - * @param[in] dst Output tensor - * @param[in] window Execution window - */ - template - void run_internal(const ITensor *src, ITensor *dst, const Window &window); - - /** Common signature for all reduction functions - * - * @param[in] src Input tensor - * @param[out] dst Output tensor - * @param[in] window Region on which to execute the kernel. 
(Must be a valid region of the window returned by window()). - */ - using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - - CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; -}; - -/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixBReductionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_b_rows) Number of matrix B rows. - * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced row must be multiplied by a scalar value. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixBReductionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] src Input tensor - * @param[in] dst Output tensor - * @param[in] window Execution window - * @param[in] info Thread-related information - */ - template - void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); - - /** Common signature for all reduction functions - * - * @param[in] src Input tensor - * @param[out] dst Output tensor - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); - - CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp deleted file mode 100644 index 9b1bf08955..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,417 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} - -void run_offset_contribution(const Window &window, - ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) -{ - Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); - collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; - const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16; - - Iterator mm_result_it(mm_result, collapsed_window); - - if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; - - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Compute the leftover term due to a_offset. 
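// The a_offset term for column x is vector_sum_col[x] * a_offset, i.e. the precomputed
// column sum of matrix B scaled by matrix A's quantization offset; 16 columns (four
// int32x4 lanes) are handled per iteration of this loop.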
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - a_offset_term_s32 *= a_offset; - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; - } - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it); - } - else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. 
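// The b_offset term is vector_sum_row[row] * b_offset, i.e. the precomputed row sum of
// matrix A scaled by matrix B's quantization offset; it is constant along the row, so it
// is computed once here and broadcast across the vector lanes.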
- int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; - - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += b_offset_term_s32; - } - }, - vector_sum_row_it, mm_result_it); - } - else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Compute the leftover term due to a_offset. 
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += a_offset_term_s32 * a_offset; - } - }, - vector_sum_col_it, mm_result_it); - } - else // false, false - { - // No offset contribution from matrix A and matrix B - return; - } -} -} // namespace - -void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_UNUSED(vector_sum_row); - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; - } - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps()); - ICpuKernel::configure(win); -} - -Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - return Status{}; -} - -void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto vector_sum_col = 
tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto mm_result = tensors.get_tensor(TensorType::ACL_DST); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); -} - -const char *CpuGemmLowpOffsetContributionKernel::name() const -{ - return "CpuGemmLowpOffsetContributionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h deleted file mode 100644 index f23a46cde7..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class CpuGemmLowpOffsetContributionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpOffsetContributionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel); - /** Initialise the kernel's input and output. - * - * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. 
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpOffsetContributionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index 332ce6f013..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,946 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) -{ - return - { - { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12) - } - }; -} - -inline int32x4x4_t load(const int32_t *ptr, int32_t x) -{ - return - { - { - vld1q_s32(ptr + x + 0), - vld1q_s32(ptr + x + 4), - vld1q_s32(ptr + x + 8), - vld1q_s32(ptr + x + 12) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) -{ - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) -{ - return - { - { - vmulq_s32(a.val[0], vld1q_s32(multilpier)), - vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), - vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), - vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) - } - }; -} - -inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) -{ - int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - return a_offset_term_s32; -} - -inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset) -{ - int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); - return b_offset_term_s32; -} - -inline int32x4x4_t get_k_offset(int32_t k_offset) -{ - return - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; -} - -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = 
vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to U8 - uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_u8 = vmaxq_u8(out_u8, min_u8); - out_u8 = vminq_u8(out_u8, max_u8); - } - - return out_u8; -} - -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); - in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); - in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); - in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -template -struct VectorTyper -{ - using stype = T; - using vtype = typename wrapper::traits::neon_bitvector_t; -}; - -inline Window get_win_vector_sum(const Window &window) -{ - Window win_vector_sum(window); - 
win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0)); - return win_vector_sum; -} - -inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col) -{ - Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window)); - return vector_sum_col_it; -} - -inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row) -{ - Window win_vector_sum_row = get_win_vector_sum(window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - return vector_sum_row_it; -} - -inline Iterator get_bias_it(const Window &window, const ITensor *bias) -{ - Window win_bias(window); - win_bias.set(Window::DimY, Window::Dimension(0, 1, 1)); - win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1)); - Iterator bias_it(bias, win_bias); - return bias_it; -} - -template -inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, - typename VT::vtype min_vec, typename VT::vtype max_vec, - int32_t a_offset, int32_t b_offset, int32_t k_offset, - int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) -{ - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) - { - // Combine quantization offset with other offsets. - offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); - } - if(has_a_offset && has_b_offset) - { - offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); - } - if(has_b_offset) - { - offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); - } - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - - if(has_a_offset) - { - in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); - } - if(has_bias) - { - in_s32 = add_s32(in_s32, load(bias_ptr, x)); - } - if(!is_fixed_point || has_b_offset) - { - in_s32 = add_s32(in_s32, offset_term_s32); - } - if(!is_fixed_point) - { - in_s32 = mul_s32(in_s32, multiplier); - } - - if(is_fixed_point) - { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); - } - else - { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - - if(has_a_offset) - { - in_value += (*(vector_sum_col_ptr + x) * a_offset); - } - if(has_bias) - { - in_value += *(bias_ptr + x); - } - - if(is_fixed_point) - { - // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound), is_bounded_relu); - } - else - { - // Finalize quantization - in_value = (in_value * multiplier) >> shift; - - // Bound and store the result - 
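// When a bounded ReLU is fused (is_bounded_relu), the value is first clamped to
// [min_bound, max_bound]; it is then saturated to the numeric range of the output type
// before being stored.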
if(is_bounded_relu) - { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); - } - *reinterpret_cast(out_it.ptr() + x) = static_cast(std::max(static_cast(std::numeric_limits::lowest()), - std::min(static_cast(std::numeric_limits::max()), in_value))); - } - } -} - -inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32_t *result_multipliers, const int32_t *result_shifts, - const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, - int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) -{ - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) - { - // Combine quantization offset with other offsets. - offset_term_s32 = add_s32(offset_term_s32, result_offset); - } - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - - if(has_a_offset) - { - in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); - } - if(has_bias) - { - in_s32 = add_s32(in_s32, load(bias_ptr, x)); - } - if(!is_fixed_point) - { - in_s32 = add_s32(in_s32, offset_term_s32); - in_s32 = mul_s32(in_s32, result_multipliers + x); - } - - if(is_fixed_point) - { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); - } - else - { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - - if(has_a_offset) - { - in_value += (*(vector_sum_col_ptr + x) * a_offset); - } - if(has_bias) - { - in_value += *(bias_ptr + x); - } - - if(is_fixed_point) - { - // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); - } - else - { - // Finalize quantization - in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); - - // Bound and store the result - if(is_bounded_relu) - { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); - } - *(out_it.ptr() + x) = static_cast(std::max(-128, std::min(127, in_value))); - } - } -} - -template -void run_offset_contribution_output_stage(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) -{ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - using Typer = VectorTyper; - - const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; - const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; - - const int32_t multiplier = output_stage.gemmlowp_multiplier; - const int32_t shift = output_stage.gemmlowp_shift; - const int32_t offset = output_stage.gemmlowp_offset; - const int32_t min_bound = output_stage.gemmlowp_min_bound; - const int32_t max_bound = output_stage.gemmlowp_max_bound; - - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); - const auto min_vec = wrapper::vdup_n(static_cast(min_bound), ExactTagType{}); - const auto max_vec = wrapper::vdup_n(static_cast(max_bound), ExactTagType{}); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); - - Iterator mm_result_it(mm_result, win); - Iterator out_it(output, win); - - if((a_offset != 0) && (b_offset != 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); - } - } - else if((a_offset == 0) && (b_offset != 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, 
vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, mm_result_it, out_it); - } - } - else if((a_offset != 0) && (b_offset == 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); - } - } - else - { - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); - } - return; - } -} - -void run_offset_contribution_output_stage_symm(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) -{ - ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); - - const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; - - const int32_t offset = output_stage.gemmlowp_offset; - const int32_t min_bound = output_stage.gemmlowp_min_bound; - const int32_t max_bound = output_stage.gemmlowp_max_bound; - - const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); - const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); - const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); - - Iterator mm_result_it(mm_result, win); - Iterator out_it(output, win); - - if(a_offset != 0) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); - } - } - else - { - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); - } - 
return; - } -} - -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(output->data_type() != DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = output->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); - } - - return Status{}; -} -} // namespace - -void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_UNUSED(vector_sum_row, bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); - - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - _output_stage = output_stage; - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; - } - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps()); - - // Note: This kernel performs 16 elements per iteration. 
- // However, since we use a left-over for loop, we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped - ICpuKernel::configure(win); -} - -Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); - return Status{}; -} - -void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); - - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); - - // Check if we need to perform fixed point requantization - const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; - - // Check if symmetric per-channel execution - const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED; - - // Check if symmetric per-channel execution - const bool is_symm = _output_stage.is_quantized_per_channel; - - if(is_symm) - { - run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - else - { - if(is_signed) - { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - else - { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - } -} - -const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const -{ - return "CpuGemmLowpOffsetContributionOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 404f2c9496..0000000000 --- 
a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel. - * - * The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. - * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. - * - * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: - * - * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift - * - * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: - * - * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * and mm_result'[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - */ - -class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpOffsetContributionOutputStageKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel); - /** Initialise the kernel inputs and output. - * - * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B. 
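(Editorial aside, not part of the patch hunk above.) The removed header documents the offset-contribution and output-stage math in prose; a minimal scalar sketch of those same formulas may help when reading the NEON implementation earlier in this file. The helper names below are illustrative only, and the real kernel uses saturating, rounding intrinsics (e.g. vqrdmulhq-style multiplies and rounding shifts) rather than this plain C++:

    #include <algorithm>
    #include <cstdint>

    // mm_result'[i][k] = mm_result[i][k] + a_offset * vector_sum_col[k]
    //                  + b_offset * vector_sum_row[i] + a_offset * b_offset * k
    inline int32_t offset_contribution(int32_t mm_result, int32_t sum_col_k, int32_t sum_row_i,
                                       int32_t a_offset, int32_t b_offset, int32_t k)
    {
        return mm_result + a_offset * sum_col_k + b_offset * sum_row_i + a_offset * b_offset * k;
    }

    // QuantizeDownInt32ToUint8Scale: ((acc + result_offset) * result_mult_int) >> result_shift
    inline uint8_t quantize_down_scale(int32_t acc, int32_t result_offset,
                                       int32_t result_mult_int, int32_t result_shift)
    {
        const int32_t v = ((acc + result_offset) * result_mult_int) >> result_shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }

    // QuantizeDownInt32ToUint8ScaleByFixedPoint: FixedPointMul(acc, m) is approximately
    // round(acc * m / 2^31); the result is then shifted and offset before clamping.
    inline uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier,
                                            int32_t result_shift, int32_t offset_after_shift)
    {
        const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        const int32_t fp   = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        const int32_t v    = (fp >> result_shift) + offset_after_shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }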
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A. - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. - */ - void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, - int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Function to use for the particular tensors passed to configure() */ - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; - GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index f1c797244a..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) - { - ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) -{ - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); - - // Multiply by result_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int); -} - -template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) -{ - return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); -} - -template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) -{ - return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); -} - -template -inline typename wrapper::traits::neon_vector::type finalize_quantization(int32x4x4_t 
&in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector::type min, - typename wrapper::traits::neon_vector::type max) -{ - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 or U8 - typename wrapper::traits::neon_vector::type out = convert_to_8bit(in_s16); - - out = wrapper::vmax(out, min); - out = wrapper::vmin(out, max); - - return out; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - using VectorType = typename wrapper::traits::neon_vector::type; - - const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift); - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits::lowest(); - const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits::max(); - - VectorType min = wrapper::vdup_n(static_cast(clamp_min), wrapper::traits::vector_128_tag{}); - VectorType max = wrapper::vdup_n(static_cast(clamp_max), wrapper::traits::vector_128_tag{}); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int in_value = 
*(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, bias_i, out); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int in_value = *(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - bias, - dst, - output_stage)); - - _output_stage = output_stage; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - // Check if we need to clamp the result using min and max - _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) - && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); - if(_output_stage->output_data_type == DataType::QASYMM8) - { - _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; - } - else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) - { - _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; - } - else - { - ARM_COMPUTE_ERROR("Data type not supported"); - } -} - -Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = 
tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ScaleKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index ca5e1b40fc..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ScaleKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ScaleKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. 
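(Editorial aside, not part of the patch hunk above.) The computation list documented for CpuGemmLowpQuantizeDownInt32ScaleKernel maps directly onto the scalar left-over loop shown earlier in the deleted .cpp. A hedged, standalone sketch of that per-element path, using illustrative names only, is:

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // T is uint8_t for QASYMM8 output and int8_t for QASYMM8_SIGNED output.
    template <typename T>
    T quantize_down_int32_scale(int32_t acc, int32_t bias, int32_t offset, int32_t multiplier,
                                int32_t shift, int32_t clamp_min, int32_t clamp_max)
    {
        int32_t v = ((acc + bias + offset) * multiplier) >> shift;   // add offsets, scale, shift
        v = std::max(clamp_min, std::min(clamp_max, v));             // optional bounded ReLU
        v = std::max<int32_t>(std::numeric_limits<T>::lowest(),      // clamp to the output type range
                              std::min<int32_t>(std::numeric_limits<T>::max(), v));
        return static_cast<T>(v);
    }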
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[out] output_stage GEMMLowp output stage metadata. - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel - * - * @param[in] src Input tensor info - * @param[in] bias Biases tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()) - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ScaleKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Biases tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. - */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; - bool _is_bounded_relu{ false }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp deleted file mode 100644 index 390e269cbb..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int16x8_t min_s16 = vdupq_n_s16(static_cast(_min)); - const int16x8_t max_s16 = vdupq_n_s16(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_s16); - ARM_COMPUTE_UNUSED(max_s16); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x2_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; - - const int32x4x2_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t 
bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x2_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; - - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - ARM_COMPUTE_UNUSED(in_value); - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_UNUSED(bias, dst); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _min = min; - _max = max; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*src, src->clone()->set_data_type(DataType::QSYMM16)); - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= -32768 && max >= 32767); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h deleted file mode 100644 index e360e65bae..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. 
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. - * - */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. 
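// The QuantizeDownFunctionPtr alias declared just below, together with the
// `template <bool is_bounded_relu>` run_internal() member documented above, forms a small
// dispatch: configure() evaluates the clamp bounds once and stores a pointer to member,
// presumably the <true> or <false> specialization (the angle-bracket template arguments
// appear to have been stripped from this copy of the diff). A self-contained sketch of the
// same pattern, with hypothetical names:
#include <cstdio>

class BoundedReluDispatchSketch
{
public:
    void configure(int min, int max)
    {
        const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
        _func = is_bounded_relu ? &BoundedReluDispatchSketch::work<true>
                                : &BoundedReluDispatchSketch::work<false>;
    }
    void run()
    {
        (this->*_func)(); // indirect call through the stored pointer-to-member
    }

private:
    template <bool IsBoundedRelu>
    void work()
    {
        // The flag is a compile-time constant inside each specialization, so the clamp
        // can be compiled out of the unbounded variant.
        std::printf("bounded relu: %d\n", static_cast<int>(IsBoundedRelu));
    }

    using FuncPtr = void (BoundedReluDispatchSketch::*)();
    FuncPtr _func{ nullptr };
};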
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp deleted file mode 100644 index 318b6a06f8..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); - const int8x16_t min_s8 = vdupq_n_s8(static_cast(_min)); - const int8x16_t max_s8 = vdupq_n_s8(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_s8, max_s8); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, 
result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _result_offset_after_shift = result_offset_after_shift; - _min = min; - _max = max; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8_SIGNED)); - - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= -128 && max >= 127); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h deleted file mode 100644 index 9c213abdf7..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. 
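// Compared with the QSYMM16 kernel earlier in this patch, the scalar view of this kernel
// (see the list just below) adds one step: result_offset_after_shift is added after the
// rounding shift and before the clamp, and the final saturation targets the signed 8-bit
// range [-128, 127] instead of [-32768, 32767].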
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. 
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp deleted file mode 100644 index 6631a4fc67..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(_min)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_u8); - ARM_COMPUTE_UNUSED(max_u8); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, 
result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _result_offset_after_shift = result_offset_after_shift; - _min = min; - _max = max; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - auto win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= 0 && max >= 255); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h deleted file mode 100644 index 13b30f3427..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. 
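// Note on the fixed-point parameters documented further down: result_fixedpoint_multiplier
// and result_shift together encode a real-valued rescale factor (typically the ratio of the
// accumulator scale to the QASYMM8 output scale) as a Q0.31 multiplier plus a right shift.
// A generic sketch of that encoding for a factor in (0, 1); the library provides its own
// helpers for this, so the name and corner-case handling here are illustrative only:
#include <cassert>
#include <cmath>
#include <cstdint>

inline void encode_rescale_sketch(float rescale, int32_t &multiplier, int &right_shift)
{
    assert(rescale > 0.f && rescale < 1.f);
    int exponent = 0;
    const float mantissa = std::frexp(rescale, &exponent); // rescale = mantissa * 2^exponent, mantissa in [0.5, 1)
    right_shift = -exponent;                                // positive right shift for factors below 1
    int64_t q = std::llround(static_cast<double>(mantissa) * static_cast<double>(int64_t{1} << 31));
    if(q == (int64_t{1} << 31))                             // mantissa rounded up to exactly 1.0
    {
        q /= 2;
        --right_shift;
    }
    multiplier = static_cast<int32_t>(q);                   // Q0.31 fixed-point multiplier
}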
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions - * - * @param[in] window Region on which to execute the kernel. 
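// For reference, a hypothetical snippet (placeholder values, not taken from the library's
// tests or docs) showing how the configure()/validate() pair documented above is typically
// driven; at run time the kernel is then fed an ITensorPack holding ACL_SRC / ACL_BIAS /
// ACL_DST, as run_op() expects.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"

void configure_quantize_down_sketch()
{
    using namespace arm_compute;

    TensorInfo src_info(TensorShape(17U, 4U), 1, DataType::S32);     // GEMMLowp accumulators
    TensorInfo dst_info(TensorShape(17U, 4U), 1, DataType::QASYMM8); // requantized output

    cpu::kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(&src_info, nullptr, &dst_info, 0, 255));
    k.configure(&src_info, nullptr, &dst_info,
                1073741824 /* placeholder multiplier */, 5 /* placeholder shift */,
                10 /* placeholder offset */, 0, 255);
}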
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp deleted file mode 100644 index da0f7b135e..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const float32x4_t beta_f32 = vdupq_n_f32(beta); - - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window.collapse_if_possible(window, Window::DimZ); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); - const float32x4x4_t c = vld4q_f32(in_ptr + x); - - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); - alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); - alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); - alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); - - vst4q_f32(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * beta; - } - }, - in, out); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const float16x8_t beta_f16 = vdupq_n_f16(beta); - - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window.collapse_if_possible(window, Window::DimZ); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); - const float16x8x2_t c = vld2q_f16(in_ptr + x); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); - alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); - - vst2q_f16(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); - } - }, - in, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -} // namespace - -void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta) -{ - ARM_COMPUTE_UNUSED(dst); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); - - _beta = beta; - switch(src->data_type()) - { - case 
DataType::F32: - _func = &matrix_addition_f32; - break; - case DataType::F16: -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &matrix_addition_f16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICPPKernel::configure(win); -} - -Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_UNUSED(beta); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - return Status{}; -} - -void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_beta != 0.0f) - { - (*_func)(src, dst, window, _beta); - } -} - -const char *CpuGemmMatrixAdditionKernel::name() const -{ - return "CpuGemmMatrixAdditionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h deleted file mode 100644 index f9450b962b..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: - * - * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size - * - * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have: - * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel - * - MTX_1 = C - */ -class CpuGemmMatrixAdditionKernel : public ICpuKernel -{ -public: - CpuGemmMatrixAdditionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixAdditionKernel); - /** Initialise the kernel's input and output. - * - * @note The input and output tensor must have the same dimensions - * - * @param[in] src Input tensor info (Matrix C). Data types supported: F16/F32 - * @param[in, out] dst Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref CpuGemmMatrixMultiplyKernel. Data type supported: the same as @p src. - * @param[in] beta Weight of matrix C - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixAdditionKernel. - * - * @note The input and output tensor must have the same dimensions - * - * Similar to @ref CpuGemmMatrixAdditionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the matrix addition functions - * - * @param[in] src An input tensor. Data types supported: F16/F32 - * @param[out] dst The output tensor. Data type supported: same as @p src - * @param[in] window Region on which to execute the kernel. - * @param[in] beta Weight of matrix C - */ - using MatrixAdditionFunctionPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window, float beta); - /** Matrix addition function to use for the particular tensor types passed to configure() */ - MatrixAdditionFunctionPtr _func{ nullptr }; - float _beta{ 0.f }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp deleted file mode 100644 index d86ea064de..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp +++ /dev/null @@ -1,1174 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); - const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); - - // The implementation computes 32 elements per iteration - const int window_start_x = 32 * info.thread_id; - const int window_step_x = 32 * info.num_threads; - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, win_out); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't 
check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 
24, acc3); - } - - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - float16x4_t vacc = vdup_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float16x4_t a0l = vld1_f16(vec_a); - - const float16x4_t b_col = - { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; - - vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); - - matrix_b += 4 * in_b_stride; - } - - float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16_t b00 = *matrix_b; - - acc += b00 * a0; - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= static_cast(alpha); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - *(vec_out) = acc; - } - }, - ina, inb, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); - const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, win_out); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. 
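// Note: as in the f16 variant above, window_end_x is rounded up to a multiple of
// window_step_x for each thread, so it can exceed the real row width width_matrix_b;
// the `if(x > width_matrix_b) { return; }` checks at the top of both loop bodies below
// are what guard against out-of-bounds stores to dst.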
- for(; x < (window_end_x - window_step_x); x += window_step_x) - { - if(x > width_matrix_b) - { - return; - } - - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); -#endif /* __arm__ */ - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); - - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); -#endif /* __arm__ */ - - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - - a0l = vld1_f32(vec_a); - - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; - - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix 
product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } - - const auto vec_out = reinterpret_cast(out.ptr()) + x; - - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; - } - - float32x4_t vacc = vdupq_n_f32(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); -#endif /* __arm__ */ - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float32x4_t a0l = vld1q_f32(vec_a); - - const float32x4_t b_col = - { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); -#endif /* __arm__ */ - - vacc = vmlaq_f32(vacc, b_col, a0l); - - matrix_b += 4 * in_b_stride; - } - - float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; - - const float b00 = *matrix_b; - - acc += b00 * a0; - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= alpha; - } - - const auto vec_out = reinterpret_cast(out.ptr()) + x; - - *vec_out = acc; - } - }, - ina, inb, out); -} - -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; - const int num_elems_matrix_b_x = rhs->info()->dimension(0); - - // Set step_x and step_y for matrix A. 
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the dst matrix - // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4 - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, window); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration - // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; - - float32x4_t acc00 = vdupq_n_f32(0.f); - float32x4_t acc10 = vdupq_n_f32(0.f); - float32x4_t acc20 = vdupq_n_f32(0.f); - float32x4_t acc30 = vdupq_n_f32(0.f); - - float32x4_t acc01 = vdupq_n_f32(0.f); - float32x4_t acc11 = vdupq_n_f32(0.f); - float32x4_t acc21 = vdupq_n_f32(0.f); - float32x4_t acc31 = vdupq_n_f32(0.f); - -#if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = 
vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); 
- acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - -#if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - const auto mtx_out1 = mtx_out0 + 4; - - if(id.x() < (out_width - 8)) - { - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - if(id.y() + 1 < out_height) - { - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); - } - } - } - } - else if(id.x() < (out_width - 4)) - { - vst1q_f32(mtx_out0, acc00); - if(id.y() + 1 < out_height) - { - vst1q_f32(mtx_out0 + out_stride1, acc10); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - } - } - } - // Left-over columns - const int columns_left = out_width - id.x() - 4; - for(auto x = 0; x < columns_left; ++x) - { - *(mtx_out1 + x) = acc01[x]; - if(id.y() + 1 < out_height) - { - *(mtx_out1 + x + out_stride1) = acc11[x]; - if(id.y() + 2 < out_height) - { - *(mtx_out1 + x + out_stride2) = acc21[x]; - if(id.y() + 3 < out_height) - { - *(mtx_out1 + x + out_stride3) = acc31[x]; - } - } - } - } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) - { - *(mtx_out0 + x) = acc00[x]; - if(id.y() + 1 < out_height) - { - *(mtx_out0 + x + out_stride1) = acc10[x]; - if(id.y() + 2 < out_height) - { - *(mtx_out0 + x + out_stride2) = acc20[x]; - if(id.y() + 3 < out_height) - { - *(mtx_out0 + x + out_stride3) = acc30[x]; - } - } - } - } - } - }, - ina, inb, out); -} - 
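Stripped of the intrinsics, the unrolled loop above performs one rank-1 update per k step over a pair of 4x4 output blocks whose operands are contiguous in memory thanks to the CpuGemmInterleave4x4 / CpuGemmTranspose1xW reshapes. A minimal scalar reference for a single 4x4 block under the same layouts (names are illustrative, and alpha is applied unconditionally for brevity):

```cpp
#include <array>

// Scalar reference for one 4x4 output block, using the data layouts the NEON
// kernel consumes:
//   - a_interleaved: matrix A after interleaving, so for each k the four
//     consecutive values are A(row0..row3, k).
//   - b_transposed:  matrix B after the 1xW transpose (W = 4 for F32), so for
//     each k the four consecutive values are B(k, col0..col3).
void gemm_block_4x4_reference(const float *a_interleaved,
                              const float *b_transposed,
                              int          k_depth,
                              float        alpha,
                              float (&acc)[4][4])
{
    for(auto &row : acc)
    {
        row[0] = row[1] = row[2] = row[3] = 0.f;
    }

    for(int k = 0; k < k_depth; ++k)
    {
        // Both operands for step k sit in consecutive memory, which is the
        // whole point of the interleave/transpose reshapes.
        const float *a_col = a_interleaved + 4 * k; // A(0..3, k)
        const float *b_row = b_transposed + 4 * k;  // B(k, 0..3)

        for(int r = 0; r < 4; ++r)
        {
            for(int c = 0; c < 4; ++c)
            {
                acc[r][c] += a_col[r] * b_row[c];
            }
        }
    }

    // Weight of the matrix product, as in the kernel's multiply_alpha path.
    for(auto &row : acc)
    {
        for(float &v : row)
        {
            v *= alpha;
        }
    }
}
```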
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const int num_elems_matrix_b_x = rhs->info()->dimension(0); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 0)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, window); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto *mtx_a0 = reinterpret_cast(ina.ptr()); - const auto *mtx_b0 = reinterpret_cast(inb.ptr()); - auto *mtx_out = reinterpret_cast(out.ptr()); - float16x8x4_t c = - { - { - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f) - } - }; - - /* - This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - |a00 a01 a02 a03 | a04 a05 a06 a07| - |a10 a11 a12 a13 | a14 a15 a16 a17| - |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... - |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... - |a40 a41 a42 a43 | a44 a45 a46 a47| - |a50 a51 a52 a53 | a54 a55 a56 a57| - |a60 a61 a62 a63 | a64 a65 a66 a67| - |a70 a71 a72 a73 | a74 a75 a76 a77| - - After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] - - B Matrix has been transposed as shown below - - |b00 b01 b02 b03 b04 b05 b06 b07| - |b10 b11 b12 b13 b14 b15 b16 b17| - |b20 b21 b22 b23 b24 b25 b26 b27| - |b30 b31 b32 b33 b34 b35 b36 b37| - -------------------> - - |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| - - c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 - c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 - - The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. 
- */ - const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - - { - const float16x8_t p00 = vld1q_f16(mtx_a0); - const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); - - const float16x8_t q00 = vld1q_f16(mtx_b0); - const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); - const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); - const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); - - mtx_a0 += 16; - mtx_b0 += 32; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - - { - const float16x4_t p00 = vld1_f16(mtx_a0); - const float16x8_t q00 = vld1q_f16(mtx_b0); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); - - mtx_a0 += 4; - mtx_b0 += 8; - } - - if(multiply_alpha) - { - c.val[0] = vmulq_f16(c.val[0], alpha_f16); - c.val[1] = vmulq_f16(c.val[1], alpha_f16); - c.val[2] = vmulq_f16(c.val[2], alpha_f16); - c.val[3] = vmulq_f16(c.val[3], alpha_f16); - } - - if(id.x() < (out_width - 8)) - { - vst1q_f16(mtx_out, c.val[0]); - if(id.y() + 1 < out_height) - { - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - if(id.y() + 2 < out_height) - { - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - if(id.y() + 3 < out_height) - { - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); - } - } - } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) - { - *(mtx_out + x) = c.val[0][x]; - if(id.y() + 1 < out_height) - { - *(mtx_out + x + 1 * out_stride) = c.val[1][x]; - if(id.y() + 2 < out_height) - { - *(mtx_out + x + 2 * out_stride) = c.val[2][x]; - if(id.y() + 3 < out_height) - { - *(mtx_out + x + 3 * out_stride) = c.val[3][x]; - } - } - } - } - } - }, - ina, inb, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_UNUSED(alpha); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, 
DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - - if(!is_interleaved) - { - ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); - } - } - else - { - const int m = reshape_info.m(); - const int n = reshape_info.n(); - const int k = reshape_info.k(); - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - - /* Interleave */ - TensorShape tensor_shape0{ lhs->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); - - if(n != 0) /* Transpose */ - { - TensorShape tensor_shape1{ rhs->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); - } - - if(dst->total_size() != 0) - { - if(n != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast(n)); - } - ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast(m)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); - } - } - - return Status{}; -} -} // namespace - -void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); - - // dst tensor auto inizialitation if not yet initialized - TensorShape tensor_shape{ lhs->tensor_shape() }; - tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); - tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); - - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape)); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); - - _alpha = alpha; - - // Configure kernel window - Window win{}; - - // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication - const bool is_dst_vector = (dst->dimension(1) == 1); - if(is_dst_vector) - { - const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); - } - else - { - constexpr unsigned int num_elems_processed_per_iteration_x = 8; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - } - - switch(lhs->data_type()) - { - case DataType::F32: - { - _func = (is_dst_vector) ? 
vector_matrix_multiply_f32 : matrix_matrix_multiply_f32; - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = (is_dst_vector) ? vector_matrix_multiply_f16 : matrix_matrix_multiply_f16; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - ICPPKernel::configure(win); -} - -Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, - const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); - - return Status{}; -} - -void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(lhs, rhs, dst, window, info, _alpha); -} - -const char *CpuGemmMatrixMultiplyKernel::name() const -{ - return "CpuGemmMatrixMultiplyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h deleted file mode 100644 index 974ff85606..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to multiply two input matrices "A" and "B". 
All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication - * - * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p lhs and @p rhs are both matrices and reshaped respectively with @ref CpuGemmInterleave4x4Kernel" and @ref CpuGemmTranspose1xWKernel - * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped - * - */ -class CpuGemmMatrixMultiplyKernel : public ICpuKernel -{ -public: - CpuGemmMatrixMultiplyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixMultiplyKernel); - /** Initialise the kernel's input and output. - * - * @note If the output tensor is a matrix, the input matrices @p lhs and @p rhs should be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel - * These two kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] lhs Left-handside tensor info containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 - * @param[in] rhs Right-handside tensor info containing the transposed Matrix B if the first input tensor A is not a vector. - * If the output tensor is a vector, rhs must contain the matrix B not reshaped. Data type supported: same as @p lhs - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p lhs. - * @param[in] alpha Weight of the matrix product - * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel - * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped - */ - void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel - * - * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the matrix multiply functions - * - * @param[in] lhs Left-handside input tensor. Data types supported: F16/F32 - * @param[in] rhs Right-handside input tensor. Data types supported: same as @p lhs - * @param[out] dst The output tensor. Data type supported: same as @p rhs - * @param[in] window Region on which to execute the kernel. - * @param[in] info Thread info metadata. - * @param[in] alpha Weight of the matrix product. 
- */ - using GemmFunctionPtr = void(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); - /** Matrix multiply function to use for the particular tensor types passed to configure() */ - GemmFunctionPtr *_func{ nullptr }; - float _alpha{ 1.f }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp deleted file mode 100644 index 4b059f57cb..0000000000 --- a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuGemmTranspose1xWKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*src))); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmTranspose1xWKernel::validate(src, dst)); - - const size_t vector_size = 16 / src->element_size(); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vector_size)); - ICPPKernel::configure(win); -} - -Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - /* - * Following an example of how the transposition1xW works when the src data type is F32 - * - * |a00 a01 a02 a03| - * |a10 a11 a12 a13| - * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | - * |a30 a31 a32 a33| - * - * The dst matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) - */ - - // Set window for dst tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - Iterator in(src, window); - Iterator out(dst, win_out); - - const size_t in_width = src->info()->dimension(0); - const size_t element_size = src->info()->element_size(); - const size_t out_stride = dst->info()->strides_in_bytes()[1]; - const size_t vector_size = 16 / element_size; - - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; - - for(size_t k = 0; k < vector_size; ++k) - { - // If the src width is not multiple of W, we fill the reference with 0s - if((id.x() + k) >= in_width) - { - std::memset(out_ptr + k * element_size, 0, element_size); - } - else - { - std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); - } - } - }, - in, out); -} - -const char *CpuGemmTranspose1xWKernel::name() const -{ - return "CpuGemmTranspose1xWKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h deleted file mode 100644 index 1a9287f7b0..0000000000 --- a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) - * - * Following an example of how the transposition1xW works when the input data is F32 - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccccccccccc} - * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - * - * Following an example of how the transposition1xW works when the input data type is F16 - * - * @f[ - * \left( \begin{array}{cccccccc} - * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\ - * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\ - * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\ - * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc} - * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\ - * \end{array} \right) - * @f] - * - * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) - * - */ -class CpuGemmTranspose1xWKernel : public ICpuKernel -{ -public: - CpuGemmTranspose1xWKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmTranspose1xWKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Input tensor info. Data types supported: All - * @param[out] dst Output tensor info. Data type supported: same as @p src. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmTranspose1xWKernel - * - * Similar to @ref CpuGemmTranspose1xWKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.cpp b/src/core/cpu/kernels/CpuIm2ColKernel.cpp deleted file mode 100644 index ca6c9bfab4..0000000000 --- a/src/core/cpu/kernels/CpuIm2ColKernel.cpp +++ /dev/null @@ -1,448 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
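The 1xW transposition documented above has a compact scalar equivalent: each source row is cut into chunks of W = 16 / element_size values, chunk c of source row y becomes columns [y*W, y*W + W) of destination row c, and chunks running past the source width are zero-filled exactly as in run_op. A sketch assuming F32 (so W = 4); the function name and the use of std::vector are illustrative only:

```cpp
#include <vector>

// Scalar reference of the 1xW transposition. Destination shape is
// [height * W, ceil(width / W)], matching the note in the kernel documentation.
std::vector<float> transpose_1xW(const std::vector<float> &src, int width, int height)
{
    constexpr int W = 16 / sizeof(float); // 4 elements per chunk for F32

    const int chunks_per_row = (width + W - 1) / W; // ceil(width / W)
    std::vector<float> dst(static_cast<size_t>(chunks_per_row) * height * W, 0.f); // zero-filled tail

    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const int chunk = x / W; // which destination row the chunk lands in
            const int lane  = x % W; // position inside the chunk
            dst[static_cast<size_t>(chunk) * (height * W) + y * W + lane] =
                src[static_cast<size_t>(y) * width + x];
        }
    }
    return dst;
}
```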
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuIm2ColKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include -#include -#include -#include -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); - - // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); - const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); - ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - - if(output->total_size() > 0) - { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - } - - return Status{}; -} - -template -inline void linearize_volume_nchw(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int top_left_x, - int top_left_y, - int kernel_width, - int kernel_height, - int kernel_depth, - int input_w, - int input_h, - int input_stride_x, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) -{ - const int kernel_size2 = kernel_width * kernel_height; - const int x_e = top_left_x + kernel_width * dilation_x; - const int y_e = top_left_y + kernel_height * dilation_y; - - // Linearize volume - int d = 0; - // This for loop linearize a volume with 3 slices. This allows: - // 1) to reduce the iterations of the outer for loop "d" - // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs - for(; d <= (kernel_depth - 3); d += 3) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - else - { - *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - out_ptr += 2 * kernel_size2; - } - - // Left over - for(; d < kernel_depth; d++) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - memset(static_cast(out_ptr), pad_value, kernel_width * sizeof(T)); - out_ptr += kernel_width; - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *out_ptr = pad_value; - } - else - { - *out_ptr = *(reinterpret_cast(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - } - - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } -} - -template -inline void linearize_volume_nhwc(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int start_x, - int start_y, - int kernel_width, - int kernel_height, - int input_w, - int input_h, - int input_c, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) -{ - const int end_x = start_x + kernel_width * dilation_x; - const int end_y = start_y + kernel_height * dilation_y; - const int pad_quant = kernel_width * input_c; - const int element_size = static_cast(sizeof(T)); - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) - { - for(int y = start_y; 
y < end_y; y += dilation_y) - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - else - { - for(int y = start_y; y < end_y; y += dilation_y) - { - if(y < 0 || y >= input_h) - { - memset(static_cast(out_ptr), pad_value, pad_quant * element_size); - out_ptr += pad_quant; - } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) - { - for(int x = start_x; x < end_x; x += dilation_x) - { - if(x < 0 || x >= input_w) - { - memset(static_cast(out_ptr), pad_value, input_c * element_size); - out_ptr += input_c; - } - else - { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); - out_ptr += input_c; - } - } - } - else - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - } - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } -} -} // namespace - -template -void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - const int input_w = src->info()->dimension(width_idx); - const int input_h = src->info()->dimension(height_idx); - const int input_c = src->info()->dimension(channel_idx); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int pad_left = _conv_info.pad_left(); - const int pad_top = _conv_info.pad_top(); - const int stride_x = _conv_info.stride().first; - const int stride_y = _conv_info.stride().second; - const int pad_value = is_data_type_quantized(src->info()->data_type()) ? 
src->info()->quantization_info().uniform().offset : 0; - - Window window_in_out(window); - // The first three dimensions of the input and output are increased by the inner loops - window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Create iterators - Iterator in(src, window_in_out); - Iterator out(dst, window_in_out); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int start_w = id[width_idx] * stride_x - pad_left; - const int start_h = id[height_idx] * stride_y - pad_top; - - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - auto output_ptr = reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); - - // Linearize volume - if(is_nchw) - { - linearize_volume_nchw(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_c, - input_w, - input_h, - input_stride_x, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - else - { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - }, - in, out); -} - -void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - ARM_COMPUTE_UNUSED(num_groups); - - _data_layout = src->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - _conv_info = conv_info; - _kernel_width = kernel_dims.width; - _kernel_height = kernel_dims.height; - _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); - _has_bias = has_bias; - - if(_data_layout == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QASYMM8_SIGNED: - case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? 
&CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - else - { - switch(src->data_type()) - { - case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - case DataType::QASYMM8_SIGNED: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false))); - - std::pair convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); - - Window win = calculate_max_window(*src, Steps()); - win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); - win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1)); - win.set(channel_idx, Window::Dimension(0, 1, 1)); - // Configure kernel window - ICpuKernel::configure(win); -} - -Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - return Status{}; -} - -void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); -} -const char *CpuIm2ColKernel::name() const -{ - return "CpuIm2ColKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.h b/src/core/cpu/kernels/CpuIm2ColKernel.h deleted file mode 100644 index ffac5077b2..0000000000 --- a/src/core/cpu/kernels/CpuIm2ColKernel.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
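For the NCHW path implemented above, the whole im2col transform amounts to gathering one zero-padded kernel_h x kernel_w patch per input channel into each output row, then appending a trailing 1 when the convolution has a bias (the "Append 1" step in linearize_volume_nchw). A minimal scalar sketch without dilation or groups; parameter names are illustrative only:

```cpp
#include <vector>

// Scalar NCHW im2col reference: 'src' is laid out [channels][height][width],
// one output row is produced per (oy, ox) position of the convolution.
std::vector<float> im2col_nchw(const std::vector<float> &src,
                               int channels, int height, int width,
                               int kernel_h, int kernel_w,
                               int stride_y, int stride_x,
                               int pad_y, int pad_x,
                               bool has_bias)
{
    const int out_h   = (height + 2 * pad_y - kernel_h) / stride_y + 1;
    const int out_w   = (width + 2 * pad_x - kernel_w) / stride_x + 1;
    const int patch   = channels * kernel_h * kernel_w;
    const int row_len = patch + (has_bias ? 1 : 0);

    std::vector<float> dst(static_cast<size_t>(out_h) * out_w * row_len, 0.f);

    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            float *out_row = dst.data() + (static_cast<size_t>(oy) * out_w + ox) * row_len;
            int    idx     = 0;
            for(int c = 0; c < channels; ++c)
            {
                for(int ky = 0; ky < kernel_h; ++ky)
                {
                    for(int kx = 0; kx < kernel_w; ++kx, ++idx)
                    {
                        const int y = oy * stride_y - pad_y + ky;
                        const int x = ox * stride_x - pad_x + kx;
                        if(y >= 0 && y < height && x >= 0 && x < width)
                        {
                            out_row[idx] = src[(static_cast<size_t>(c) * height + y) * width + x];
                        }
                        // Out-of-bounds samples keep the pad value (0 for float data).
                    }
                }
            }
            if(has_bias)
            {
                out_row[patch] = 1.f; // bias column
            }
        }
    }
    return dst;
}
```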
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_IM2COL_KERNEL_H -#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Interface for the im2col reshape kernel. - * - * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. - * It is used to transform a convolution to a plain matrix multiplication. - * - * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ - * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ - * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ - * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - */ -class CpuIm2ColKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuIm2ColKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 - * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false - * @param[out] dst The output tensor info. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuIm2ColKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run im2col - * - * @param[in] src The input tensor info - * @param[out] dst The output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_im2col(const ITensor *src, ITensor *dst, const Window &window); - - /** Common signature for all the specialised im2col functions - * - * @param[in] window Region on which to execute the kernel. - */ - using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - - Im2ColFunctionPtr _func{ nullptr }; - std::pair _convolved_dims{}; - PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_IM2COL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuMulKernel.cpp b/src/core/cpu/kernels/CpuMulKernel.cpp deleted file mode 100644 index 82ec322875..0000000000 --- a/src/core/cpu/kernels/CpuMulKernel.cpp +++ /dev/null @@ -1,1729 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuMulKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -const float scale255_constant = 1.f / 255.f; -const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); -const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); - -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, - DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); - } - - if(dst->total_size() > 0) - { - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - // clang-format off - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) - , "Invalid data type combination"); - // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); - } - - if(std::abs(scale - scale255_constant) < 0.00001f) - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && 
dst->data_type() == DataType::S32, - "Scale == 1/255 is not supported if input and dst are of data type S32"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); - - int exponent = 0; - const float normalized_mantissa = std::frexp(scale, &exponent); - - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); - } - - return Status{}; -} - -/* Scales a given vector by 1/255. - * - * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. - * - * @param in Input vector to scale. - * @return Scaled dst rounded to nearest (round half up). - */ -inline int32x4_t scale255_S32_S32(int32x4_t in) -{ - // Scale - const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); - // Round to nearest (round half up) - // Add +0.5 for all values - // Afterwards vcvt rounds toward zero - return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); -} - -inline uint16x8_t scale255_U16_U16(uint16x8_t in) -{ - const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); - const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); - return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); -} - -template -inline typename std::enable_if::value, int8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize(val, info); -} - -template -void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - 
vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); - } -} - -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = 
static_cast(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); -} - -void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) -{ - ARM_COMPUTE_UNUSED(scale); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - const int32x4x4_t in1_s32 = - { - { - vmovl_s16(vget_low_s16(input1_q.val[0])), - vmovl_s16(vget_high_s16(input1_q.val[0])), - vmovl_s16(vget_low_s16(input1_q.val[1])), - vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { - vmovl_s16(vget_low_s16(input2_q.val[0])), - vmovl_s16(vget_high_s16(input2_q.val[0])), - vmovl_s16(vget_low_s16(input2_q.val[1])), - vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; - - const int32x4x4_t result = - { - { - vmulq_s32(in1_s32.val[0], in2_s32.val[0]), - vmulq_s32(in1_s32.val[1], in2_s32.val[1]), - vmulq_s32(in1_s32.val[2], in2_s32.val[2]), - vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); -} - -template -void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator 
input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); - } - else - { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); - } - } - if(is_sat) - { - vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) - { - tmp = 255; - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) -{ - int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); - const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); - int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); - const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); - - tmp1_high = vmulq_s32(tmp1_high, tmp2_high); - tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_S32_S32(tmp1_high); - tmp1_low = scale255_S32_S32(tmp1_low); - } - else - { - // Right shift amount - const int32x4_t vn = vdupq_n_s32(-n); - // Left shift amount - const int32x4_t vnl = vdupq_n_s32(n); - // Calculate conversion bit - const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); - const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); - const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); - const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); - const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); - const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); - const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); - const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) - { - tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - else - { - tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - 
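// Illustration of the "conversion bit" trick used just above (sketch only, the
// helper name below is hypothetical): for a negative product v and shift n, a plain
// arithmetic right shift (v >> n) rounds toward negative infinity; adding (2^n - 1)
// beforehand makes the shift truncate toward zero, which is what the sign/convert
// vectors emulate lane-wise.
//
//   int32_t shift_round_to_zero(int32_t v, int n)
//   {
//       const int32_t bias = (v < 0) ? ((1 << n) - 1) : 0; // conversion bits
//       return (v + bias) >> n;
//   }
//
// e.g. v = -7, n = 1: (-7 + 1) >> 1 = -3 (toward zero), whereas -7 >> 1 = -4.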
tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - } - - if(is_sat) - { - return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); - } - else - { - return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); - } -} - -template -inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) -{ - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) -{ - const int32x2_t input1_1 = vget_low_s32(src1); - const int32x2_t input2_1 = vget_low_s32(src2); - const int32x2_t input1_2 = vget_high_s32(src1); - const int32x2_t input2_2 = vget_high_s32(src2); - - int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); - int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); - - // Apply scaling, conversion and rounding (round to zero) - // Right shift amount - const int64x2_t vn = vdupq_n_s64(-n); - // Left shift amount - const int64x2_t vnl = vdupq_n_s64(n); - // Calculate conversion bit - const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); - const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); - const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); - const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); - - const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); - const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); - const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); - const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) - { - tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); - } - else - { - tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); - } -} - -template -inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) -{ - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = - { - { - vld1q_s32(non_broadcast_input_ptr + x), - vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t ta1 = - { - { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), - } - }; - const int32x4x2_t ta2 = - { - { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); - } -} - -void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} - -void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - 
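// Note on the complex path (illustrative sketch, not a separate helper in this file):
// the 2-channel F32 tensors are treated as interleaved complex values (re, im), and
// each output element is the complex product that the NEON code below vectorises.
// Scalar form, for a = (a0, a1) and b = (b0, b1):
//
//   float c0 = a0 * b0 - a1 * b1; // real part
//   float c1 = a0 * b1 + a1 * b0; // imaginary part
//
// This matches the scalar left-over loop of the non-broadcast path further down.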
Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 8 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute 
window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 + a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = - { - { - vdupq_n_f16(broadcast_value), - vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = - { - { - vld1q_f16(non_broadcast_input_ptr + x), - vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = - { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - 
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); - - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp_low = vqshlq_u16(tmp_low, vn); - tmp_high = vqshlq_u16(tmp_high, vn); - } - else - { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); - } - } - - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); - } - - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; - } - - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = - { - { - vld1_u8(input2_ptr + x), - vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Simply swap the two input buffers - mul_S16_U8_S16(src2, src1, out, window, n); -} -} // namespace - -void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(*dst, out_shape); - - _scale = scale; - _scale_exponent = 0; - _func_quantized = nullptr; - _func_int = nullptr; - _func_float = nullptr; - - bool is_scale_255 = false; - // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) - { - is_scale_255 = true; - } - else - { - int exponent = 0; - - std::frexp(scale, &exponent); - - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - _scale_exponent = std::abs(exponent - 1); - } - - const DataType dt_input1 = src1->data_type(); - const DataType dt_input2 = src2->data_type(); - const DataType dt_output = dst->data_type(); - const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - - switch(dt_input1) - { - case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) - { - _func_quantized = &mul_saturate_quantized_8; - } - break; - case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) - { - _func_quantized = &mul_saturate_quantized_8; - ; - } - break; - case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) - { - _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; - } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) - { - _func_int = &mul_QSYMM16_QSYMM16_S32; - } - break; - case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - else - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - else - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - } - break; - case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) - { - _func_int = is_sat ? &mul_S32_S32_S32 : &mul_S32_S32_S32; - } - break; - case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - else - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - else - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? 
&mul_U8_S16_S16 : &mul_U8_S16_S16; - } - else - { - _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; - } - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func_float = &mul_F16_F16_F16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - _func_float = &mul_F32_F32_F32; - break; - default: - ARM_COMPUTE_ERROR("You called with the wrong img formats"); - } - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - return Status{}; -} - -void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_func_quantized != nullptr) - { - (*_func_quantized)(src1, src2, dst, window, _scale); - } - else if(_func_int != nullptr) - { - (*_func_int)(src1, src2, dst, window, _scale_exponent); - } - else - { - ARM_COMPUTE_ERROR_ON(_func_float == nullptr); - (*_func_float)(src1, src2, dst, window, _scale); - } -} -const char *CpuMulKernel::name() const -{ - return "CpuMulKernel"; -} -namespace -{ -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); - auto_init_if_empty(*dst, out_info); - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); - - return Status{}; -} - -void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - 
ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - c_mul_F32_F32_F32_n(src1, src2, dst, window); -} - -const char *CpuComplexMulKernel::name() const -{ - return "CpuComplexMulKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuMulKernel.h b/src/core/cpu/kernels/CpuMulKernel.h deleted file mode 100644 index 3ea176cc31..0000000000 --- a/src/core/cpu/kernels/CpuMulKernel.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H -#define ARM_COMPUTE_CPU_MUL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform multiplication between two tensors */ -class CpuMulKernel : public ICpuKernel -{ -public: - CpuMulKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel); - /** Initialise the kernel's input, dst and border mode. - * - * Valid configurations (Src1,Src2) -> Dst : - * - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] scale Scale to apply after multiplication. 
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - - // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the specialised multiplication functions with integer scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Integer scale factor. - */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); - /** Common signature for all the specialised multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. - */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. - * - */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class CpuComplexMulKernel : public ICpuKernel -{ -public: - CpuComplexMulKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel); - /** Initialise the kernel's src, dst and border mode. - * - * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
- */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuComplexMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPermuteKernel.cpp b/src/core/cpu/kernels/CpuPermuteKernel.cpp deleted file mode 100644 index 270d6e222e..0000000000 --- a/src/core/cpu/kernels/CpuPermuteKernel.cpp +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
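The header above follows the stateless-kernel pattern used throughout these files: configure()/validate() operate on ITensorInfo, while run_op() receives the actual buffers through an ITensorPack at execution time. A usage sketch under that assumption; include paths are the pre-move ones deleted by this patch, and the shapes/scale are hypothetical:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuMulKernel.h"

    using namespace arm_compute;

    void multiply_f32()
    {
        const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
        Tensor a, b, out;
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        cpu::kernels::CpuMulKernel k;
        // scale = 1.f means no rescaling; only round-to-zero is supported for such scales.
        k.configure(a.info(), b.info(), out.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill a and b ...

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
        pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
        pack.add_tensor(TensorType::ACL_DST, &out);
        NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
    }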
- */ -#include "src/core/cpu/kernels/CpuPermuteKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace -{ -#include "src/core/NEON/kernels/convolution/common/shims.hpp" -} // namespace - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -inline bool is_permutation_supported(const PermutationVector &v) -{ - static const std::array permutations2 = - { - { - PermutationVector(0U, 1U), - PermutationVector(1U, 0U), - } - }; - static const std::array permutations3 = - { - { - PermutationVector(2U, 0U, 1U), - PermutationVector(1U, 2U, 0U), - PermutationVector(0U, 1U, 2U), - PermutationVector(0U, 2U, 1U), - PermutationVector(1U, 0U, 2U), - PermutationVector(2U, 1U, 0U), - } - }; - static const std::array permutations4 = - { - { - PermutationVector(0U, 1U, 2U, 3U), - PermutationVector(1U, 0U, 2U, 3U), - PermutationVector(2U, 0U, 1U, 3U), - PermutationVector(0U, 2U, 1U, 3U), - PermutationVector(1U, 2U, 0U, 3U), - PermutationVector(2U, 1U, 0U, 3U), - PermutationVector(2U, 1U, 3U, 0U), - PermutationVector(1U, 2U, 3U, 0U), - PermutationVector(3U, 2U, 1U, 0U), - PermutationVector(2U, 3U, 1U, 0U), - PermutationVector(1U, 3U, 2U, 0U), - PermutationVector(3U, 1U, 2U, 0U), - PermutationVector(3U, 0U, 2U, 1U), - PermutationVector(0U, 3U, 2U, 1U), - PermutationVector(2U, 3U, 0U, 1U), - PermutationVector(3U, 2U, 0U, 1U), - PermutationVector(0U, 2U, 3U, 1U), - PermutationVector(2U, 0U, 3U, 1U), - PermutationVector(1U, 0U, 3U, 2U), - PermutationVector(0U, 1U, 3U, 2U), - PermutationVector(3U, 1U, 0U, 2U), - PermutationVector(1U, 3U, 0U, 2U), - PermutationVector(0U, 3U, 1U, 2U), - PermutationVector(3U, 0U, 1U, 2U) - } - }; - - return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) - || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported."); - - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - - // Validate configured destination - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -template -void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm) -{ - const DataLayout src_layout = src->info()->data_layout(); - - // Source window - Window window_src = window; - - // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others - // we have to fall back to C++ - if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) - { - 
window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); - window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); - window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); - window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); - } - - // Destination window - Window window_dst(window); - const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) - { - window_dst.set(d, zero_window); - } - - // Create iterators - Iterator src_it(src, window_src); - Iterator dst_it(dst, window_dst); - - int in_row_stride = 0; - int in_col_stride = 0; - int in_channel_stride = 0; - int in_batch_stride = 0; - int n_cols = 0; - int n_rows = 0; - int n_channels = 0; - int n_batches = 0; - - switch(src_layout) - { - case DataLayout::NCHW: - { - in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_cols = src->info()->tensor_shape().x(); - n_rows = window_src.y().step(); - n_channels = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - case DataLayout::NHWC: - { - in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_channels = src->info()->tensor_shape().x(); - n_cols = window_src.y().step(); - n_rows = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - default: - { - ARM_COMPUTE_ERROR("Invalid source data layout."); - break; - } - } - - // CHW -> HWC - if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) - { - const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); - const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); - const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); - const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; - reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_channels, n_rows, n_cols, - in_batch_stride, in_channel_stride, in_row_stride, - out_batch_stride, out_row_stride, out_col_stride); - }, - src_it, dst_it); - } - // HWC -> CHW - else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) - { - const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); - const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); - const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); - const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; - reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_rows, n_cols, n_channels, - in_batch_stride, 
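The NCHW -> NHWC fast path above hands the layout change to reorder::nchw_to_nhwc() with explicit strides. As a scalar reference of the same index mapping over contiguous buffers (illustrative only, not from the patched sources):

    #include <cstddef>

    // Plain NCHW -> NHWC reorder; dst[n][h][w][c] = src[n][c][h][w].
    void nchw_to_nhwc_ref(const float *src, float *dst, size_t n, size_t c, size_t h, size_t w)
    {
        for(size_t in = 0; in < n; ++in)
            for(size_t ic = 0; ic < c; ++ic)
                for(size_t ih = 0; ih < h; ++ih)
                    for(size_t iw = 0; iw < w; ++iw)
                    {
                        const size_t src_idx = ((in * c + ic) * h + ih) * w + iw; // NCHW linearisation
                        const size_t dst_idx = ((in * h + ih) * w + iw) * c + ic; // NHWC linearisation
                        dst[dst_idx] = src[src_idx];
                    }
    }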
in_row_stride, in_col_stride, - out_batch_stride, out_channel_stride, out_row_stride); - }, - src_it, dst_it); - } - else - { - // All other cases fall back to C++ - // Permute strides - Strides strides = dst->info()->strides_in_bytes(); - Strides perm_strides = strides; - permute_strides(perm_strides, perm); - const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; - *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); - }, - src_it, dst_it); - } -} -} // namespace - -void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); - - _perm = perm; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - // This kernel doesn't need padding so update_window_and_padding() can be skipped - - ICpuKernel::configure(win); -} - -Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); - return Status{}; -} - -void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->element_size()) - { - case 1: - run_permute(window, src, dst, _perm); - break; - case 2: - run_permute(window, src, dst, _perm); - break; - case 4: - run_permute(window, src, dst, _perm); - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } -} - -const char *CpuPermuteKernel::name() const -{ - return "CpuPermuteKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPermuteKernel.h b/src/core/cpu/kernels/CpuPermuteKernel.h deleted file mode 100644 index 2955f38960..0000000000 --- a/src/core/cpu/kernels/CpuPermuteKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
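The generic fallback above folds the permutation into "permuted strides": assuming the library's convention that dst_shape[i] = src_shape[perm[i]], the destination coordinate along axis i equals the source coordinate along axis perm[i], so one destination offset can be accumulated directly from the source coordinate. A scalar sketch of that offset computation (hypothetical helper, not part of the patch):

    #include <array>
    #include <cstddef>

    // Destination byte offset for a given source coordinate under permutation `perm`,
    // assuming dst_shape[i] = src_shape[perm[i]].
    size_t permuted_dst_offset(const std::array<size_t, 4> &src_coord,
                               const std::array<size_t, 4> &dst_strides,
                               const std::array<size_t, 4> &perm)
    {
        size_t offset = 0;
        for(size_t i = 0; i < 4; ++i)
        {
            offset += src_coord[perm[i]] * dst_strides[i]; // dst axis i reads src axis perm[i]
        }
        return offset;
    }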
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H -#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform tensor permutation given a permutation vector */ -class CpuPermuteKernel : public ICpuKernel -{ -public: - CpuPermuteKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel); - /** Configure kernel for a given list of arguments - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src Srouce tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuPermuteKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PermutationVector _perm{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp deleted file mode 100644 index 27f4b950db..0000000000 --- a/src/core/cpu/kernels/CpuPool2dKernel.cpp +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuPool2dKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -#include "src/core/NEON/wrapper/wrapper.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -using namespace misc::shape_calculator; - -struct PoolingSelectorData -{ - DataType dt; - DataLayout dl; - int pool_stride_x; - Size2D pool_size; -}; - -using PoolingSelectorPtr = std::add_pointer::type; -using PoolingKernelPtr = std::add_pointer::type; -struct PoolingKernel -{ - const char *name; - const PoolingSelectorPtr is_selected; - PoolingKernelPtr ukernel; -}; - -static const PoolingKernel available_kernels[] = -{ - { - "neon_qu8_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "neon_qs8_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_f16_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, -#if defined(ENABLE_NCHW_KERNELS) - { - "neon_qu8_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - 
"neon_qs8_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool7", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, -#endif /* defined(ENABLE_NCHW_KERNELS) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, dl, pool_stride_x, pool_size })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); - - int pool_stride_x = 0; - int pool_stride_y = 0; - int output_width = 0; - int output_height = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = 
pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); - } - } - - const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - BorderSize &border_size, - int pool_size_x, int pool_size_y) -{ - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) - { - // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); - } - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
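The validation above requires the pooled extents returned by scaled_dimensions_signed() to be at least 1. As a reference for the common FLOOR-rounding case (CEIL rounding would round the division up instead), a small worked sketch of the pooled output extent:

    #include <cassert>

    // Pooled output extent with FLOOR rounding: floor((in + pads - pool) / stride) + 1.
    int pooled_extent_floor(int in, int pool, int stride, int pad_before, int pad_after)
    {
        return (in + pad_before + pad_after - pool) / stride + 1;
    }

    int main()
    {
        assert(pooled_extent_floor(224, 2, 2, 0, 0) == 112); // 2x2 stride-2 pooling halves the extent
        assert(pooled_extent_floor(7, 7, 1, 0, 0) == 1);     // global pooling collapses to 1
        return 0;
    }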
src->data_layout() : pool_info.data_layout; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_horizontal_window = 0; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); - - //If it's not squared and optimized will be executed the MxN - num_elems_read_per_iteration = 1; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - - if(is_square) - { - switch(src->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - case 3: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - default: - break; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(pool_size_x) - { - case 2: - case 3: - num_elems_read_per_iteration = 4; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - break; - } - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 2; - break; - case 3: - num_elems_read_per_iteration = 4; // We use vload4 for pooling3 - break; - case 7: - num_elems_read_per_iteration = 8; // We use vload8 for pooling7 - break; - default: - break; - } - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - } - - bool window_changed = false; - Window win{}; - if(data_layout == DataLayout::NCHW) - { - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - TensorShape dst_shape{ src->tensor_shape() }; - dst_shape.set(0, pooled_w); - dst_shape.set(1, pooled_h); - TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); - win = 
calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom); - AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window); - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - - border_size = src->padding(); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -BorderSize CpuPool2dKernel::border_size() const -{ - return _border_size; -} - -void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - - const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size); - ARM_COMPUTE_ERROR_ON(uk == nullptr); - - // Set instance variables - _pool_info = pool_info; - _data_layout = src->data_layout(); - _pool_size = pool_size; - _pool_stride_x = pad_stride_info.stride().first; - _run_method = uk->ukernel; - _name = std::string("CpuPool2dKernel").append("/").append(uk->name); - - if(_data_layout == DataLayout::NHWC) - { - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); - } - else - { - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - _border_size, pool_size.x(), pool_size.y()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); - } -} - -Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - - unsigned int num_elems_processed_per_iteration = 0; - BorderSize border_size(0); - - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, - pool_size_x, pool_size_y) - .first); - - return Status{}; -} - -void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); - ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); - - const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; - const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; - const unsigned int pool_size = _pool_info.pool_size.width; - - Window window_src(window); - if(_data_layout == DataLayout::NCHW) - { - // Set step for src in x and y direction for the src - unsigned int window_x_inc = 0; - switch(src->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) - { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; - } - break; - } - - case DataType::F16: - case DataType::F32: - { - window_x_inc = pool_stride_x; - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - } - } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); - } - else - { - window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); - window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - } - _run_method(src, dst, indices, _pool_info, window_src, window); -} - -const char *CpuPool2dKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h deleted file mode 100644 index 9ed398b907..0000000000 --- a/src/core/cpu/kernels/CpuPool2dKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H -#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class CpuPool2dKernel : public ICpuKernel -{ -public: - CpuPool2dKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel); - /** Configure kernel for a given list of arguments - * - * @note F16 are supported for pool sizes 2 and 3 only - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: Same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - const char *name() const override; - -private: - using PoolingKernelPtr = std::add_pointer::type; - -private: - PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; - BorderSize _border_size{ 0 }; - Size2D _pool_size{}; - int _pool_stride_x{}; - PoolingKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.cpp b/src/core/cpu/kernels/CpuQuantizeKernel.cpp deleted file mode 100644 index 8ca81e8b11..0000000000 --- a/src/core/cpu/kernels/CpuQuantizeKernel.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
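Because CpuPool2dKernel::validate() above works on ITensorInfo only, a configuration can be checked without allocating any buffers. A sketch under that assumption; the shapes, layout, and PoolingLayerInfo arguments are hypothetical and use the pre-move include path removed by this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/cpu/kernels/CpuPool2dKernel.h"

    using namespace arm_compute;

    bool can_run_2x2_max_pool()
    {
        TensorInfo src(TensorShape(16U, 16U, 8U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        TensorInfo dst; // left empty: the pooled shape is derived during validation/configuration

        const PoolingLayerInfo info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC,
                                    PadStrideInfo(2, 2, 0, 0));
        return bool(cpu::kernels::CpuPool2dKernel::validate(&src, &dst, info));
    }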
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuQuantizeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/CPP/Validate.h" - -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -constexpr auto window_step = 16; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - - return Status{}; -} - -template -inline float32x4x4_t load_value(const T *input_ptr) -{ - using Tx16_t = typename wrapper::traits::neon_vector::type; - return arm_compute::convert_to_float32x4x4(wrapper::vloadq(input_ptr)); -} - -template <> -inline float32x4x4_t load_value(const float *input_ptr) -{ - return { wrapper::vloadq(input_ptr), - wrapper::vloadq(input_ptr + 4), - wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12) }; -} -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float32x4x4_t load_value(const float16_t *input_ptr) -{ - return { vcvt_f32_f16(wrapper::vload(input_ptr)), - vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), - vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; -} - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template -using vector_type = wrapper::traits::neon_vector_t; - -template -vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); - -template <> -vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - return vquantize(qv, qi); -} - -template <> -vector_type vquantize_qasymm8(const float32x4x4_t &qv, 
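The vectorised paths above quantise sixteen floats per iteration via float32x4x4_t. A scalar reference of the QASYMM8 mapping they implement, q = clamp(round(x / scale) + offset, 0, 255) for a uniform (scale, offset) pair; note the actual rounding mode depends on the target, as handled in run_quantize_qasymm8() below (sketch only, not from the patched sources):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar QASYMM8 quantisation; std::lround rounds half away from zero, whereas the
    // kernel uses round-to-nearest-even on AArch64 and round-to-zero elsewhere.
    uint8_t quantize_qasymm8_ref(float x, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }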
const UniformQuantizationInfo &qi) -{ - return vquantize_signed(qv, qi); -} - -} // namespace - -void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - static const std::map quant_map = - { - { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - - { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - - { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ - }; - - std::string function_to_call("op_"); - function_to_call += string_from_data_type(src->data_type()) + "_"; - function_to_call += string_from_data_type(dst->data_type()); - - auto it = quant_map.find(function_to_call); - - if(it == quant_map.end()) - { - ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); - } - _func = it->second; - - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); -} - -Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -template -void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - 
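configure() above dispatches by building a string key of the form "op_<SRC_TYPE>_<DST_TYPE>" and looking it up in a map of member-function pointers. A generic sketch of that lookup shape with hypothetical names (not the kernel's actual map):

    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>

    using QuantizeFn = std::function<void()>;

    // Resolve the handler for a (src, dst) data-type pair, failing loudly on unsupported combinations.
    QuantizeFn resolve(const std::string &src_type, const std::string &dst_type,
                       const std::map<std::string, QuantizeFn> &table)
    {
        const std::string key = "op_" + src_type + "_" + dst_type;
        const auto it = table.find(key);
        if(it == table.end())
        {
            throw std::runtime_error("Unsupported combination of input and output data types");
        }
        return it->second;
    }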
input, output); -} - -template -void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - -void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); -} - -const char *CpuQuantizeKernel::name() const -{ - return "CpuQuantizeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.h b/src/core/cpu/kernels/CpuQuantizeKernel.h deleted file mode 100644 index 834a2e03d2..0000000000 --- a/src/core/cpu/kernels/CpuQuantizeKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the quantization layer kernel. - * - * @note The implementation supports only 3D input tensors - */ -class CpuQuantizeKernel : public ICpuKernel -{ -public: - CpuQuantizeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel); - /** Set the input, output. - * - * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. - * - * @note Output auto initialization is not supported by this kernel - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuQuantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the specialised @ref CpuQuantizeKernel functions - * - * @param[in] window Region on which to execute the kernel. - */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template - void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM16 quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template - void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window); - - QuantizeFunctionExecutorPtr _func{ nullptr }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuReshapeKernel.cpp b/src/core/cpu/kernels/CpuReshapeKernel.cpp deleted file mode 100644 index 5b717b9bba..0000000000 --- a/src/core/cpu/kernels/CpuReshapeKernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuReshapeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -/** [NEReshapeLayerKernel Kernel] **/ -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->tensor_shape().total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} - -template -inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst) -{ - const TensorShape &src_shape = src->info()->tensor_shape(); - const TensorShape &dst_shape = dst->info()->tensor_shape(); - Coordinates dst_coord{}; - - Iterator src_it(src, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - dst_coord = index2coords(dst_shape, coords2index(src_shape, id)); - *reinterpret_cast(dst->ptr_to_element(dst_coord)) = *reinterpret_cast(src_it.ptr()); - }, - src_it); -} -} // namespace - -void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - ARM_COMPUTE_UNUSED(dst); - - // Configure kernel window - Window win = calculate_max_window(*src); - - ICpuKernel::configure(win); -} - -Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - - return Status{}; -} - -void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->data_type()) - { - case DataType::U8: - case DataType::S8: - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - reshape_tensor(window, src, dst); - break; - case DataType::U16: - case DataType::S16: - case DataType::F16: - reshape_tensor(window, src, dst); - break; - case DataType::U32: - case DataType::S32: - case DataType::F32: - reshape_tensor(window, src, dst); - break; - 
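The reshape_tensor() helper above never rearranges data: it converts each source coordinate to a linear index and back into a destination coordinate, so element i of the source lands at element i of the destination. A small stand-alone sketch of that mapping follows; coords_to_index()/index_to_coords() are simplified stand-ins for the library's coords2index()/index2coords().

#include <cstdio>
#include <vector>

using Shape  = std::vector<int>;
using Coords = std::vector<int>;

// Row-major (dimension 0 fastest) linearisation, matching how the window iterates.
static int coords_to_index(const Shape &shape, const Coords &c)
{
    int index = 0, stride = 1;
    for(size_t d = 0; d < shape.size(); ++d)
    {
        index += c[d] * stride;
        stride *= shape[d];
    }
    return index;
}

static Coords index_to_coords(const Shape &shape, int index)
{
    Coords c(shape.size());
    for(size_t d = 0; d < shape.size(); ++d)
    {
        c[d] = index % shape[d];
        index /= shape[d];
    }
    return c;
}

int main()
{
    const Shape  src_shape{ 4, 3 }; // 12 elements
    const Shape  dst_shape{ 6, 2 }; // same total size, different shape
    const Coords src_coord{ 1, 2 }; // element at x=1, y=2
    const Coords dst_coord = index_to_coords(dst_shape, coords_to_index(src_shape, src_coord));
    std::printf("(%d, %d) -> (%d, %d)\n", src_coord[0], src_coord[1], dst_coord[0], dst_coord[1]); // (1, 2) -> (3, 1)
}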
default: - ARM_COMPUTE_ERROR("Unsupported data type!"); - } -} - -const char *CpuReshapeKernel::name() const -{ - return "CpuReshapeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -/** [NEReshapeLayerKernel Kernel] **/ diff --git a/src/core/cpu/kernels/CpuReshapeKernel.h b/src/core/cpu/kernels/CpuReshapeKernel.h deleted file mode 100644 index 1425fbe917..0000000000 --- a/src/core/cpu/kernels/CpuReshapeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H -#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform tensor reshaping */ -class CpuReshapeKernel : public ICpuKernel -{ -public: - CpuReshapeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data type supported: All - * @param[out] dst Destination tensor info. Data type supported: Same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp deleted file mode 100644 index 0c1f08ab79..0000000000 --- a/src/core/cpu/kernels/CpuScaleKernel.cpp +++ /dev/null @@ -1,623 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
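Note that the switch in run_op() above dispatches on element size rather than exact type: a reshape only relocates bytes, so one instantiation per 1-, 2- and 4-byte width covers every listed data type. A compact sketch of that idea, with hypothetical helper names:

#include <cstddef>
#include <cstdint>
#include <cstdio>

template <typename T>
static void copy_as(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T       *d = static_cast<T *>(dst);
    for(std::size_t i = 0; i < n; ++i) { d[i] = s[i]; }
}

// One path per element size; the exact data type is irrelevant for a byte-preserving reshape.
static void reshape_bytes(const void *src, void *dst, std::size_t n, std::size_t element_size)
{
    switch(element_size)
    {
        case 1: copy_as<uint8_t>(src, dst, n);  break; // U8, S8, QASYMM8, QASYMM8_SIGNED
        case 2: copy_as<uint16_t>(src, dst, n); break; // U16, S16, F16
        case 4: copy_as<uint32_t>(src, dst, n); break; // U32, S32, F32
        default: std::puts("Unsupported data type!");  break;
    }
}

int main()
{
    const float in[4] = { 1.f, 2.f, 3.f, 4.f };
    float       out[4] = {};
    reshape_bytes(in, out, 4, sizeof(float));
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
}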
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuScaleKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/scale/neon/list.h" -#include "src/core/cpu/kernels/scale/sve/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ScaleSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using ScaleSelectorPtr = std::add_pointer::type; -using ScaleKernelPtr = std::add_pointer::type; -struct ScaleKernel -{ - const char *name; - const ScaleSelectorPtr is_selected; - ScaleKernelPtr ukernel; -}; - -static const ScaleKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) - }, - { - "sve_fp32_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) - }, - { - "sve_qu8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) - }, - { - "sve_qs8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) - }, - { - "sve_u8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) - }, - { - "sve_s16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_scale", - [](const ScaleSelectorData & data) { return data.dt == 
DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) - }, -#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_qu8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) - }, - { - "neon_qs8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) - }, - { - "neon_u8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_s16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const ScaleKernel *get_implementation(const ScaleSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - ARM_COMPUTE_UNUSED(info.constant_border_value); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); - - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? 
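The available_kernels[] table and get_implementation() above form a simple first-match micro-kernel selector: SVE entries precede the NEON fallbacks, so the most specialised implementation the CPU supports wins. A minimal stand-alone sketch of that pattern follows; DataType, CpuFlags and the two dummy ukernels are simplified stand-ins.

#include <cstdio>

enum class DataType { F32, F16, QASYMM8 };
struct CpuFlags     { bool has_fp16; bool has_sve; };
struct SelectorData { DataType dt; CpuFlags ci; };

struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

static void sve_fp32_scale()  { std::puts("sve_fp32_scale"); }
static void neon_fp32_scale() { std::puts("neon_fp32_scale"); }

// Order matters: more specialised entries first, generic fallbacks last.
static const Kernel available_kernels[] =
{
    { "sve_fp32_scale",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.ci.has_sve; }, sve_fp32_scale  },
    { "neon_fp32_scale", [](const SelectorData &d) { return d.dt == DataType::F32; },                 neon_fp32_scale },
};

static const Kernel *get_implementation(const SelectorData &data)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(data)) { return &uk; }
    }
    return nullptr;
}

int main()
{
    const Kernel *uk = get_implementation({ DataType::F32, { /*has_fp16=*/true, /*has_sve=*/false } });
    if(uk != nullptr) { uk->ukernel(); } // without SVE this falls back to neon_fp32_scale
}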
src->data_layout() : info.data_layout; - const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const auto output_width = dst->dimension(width_index); - const auto output_height = dst->dimension(height_index); - ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); - ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); - - if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - } - - if(info.interpolation_policy == InterpolationPolicy::BILINEAR) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - if(dx != nullptr && dy != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - - if(info.interpolation_policy == InterpolationPolicy::AREA) - { - ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); - } - - return Status{}; -} -} // namespace - -void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, - ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(dx, dy, offsets); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - dx, - dy, - offsets, - dst, - info)); - - const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); - - // Get data layout and width/height indices - _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - _policy = info.interpolation_policy; - _border_mode = info.border_mode; - _constant_border_value = info.constant_border_value; - _align_corners = info.align_corners; - - if(info.sampling_policy == SamplingPolicy::CENTER) - { - _sampling_offset = 0.5f; - } - - // Compute the ratio between source width/height and destination width/height - const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
InterpolationPolicy::NEAREST_NEIGHBOR : _policy; - - if(_border_mode == BorderMode::UNDEFINED) - { - _border_mode = BorderMode::CONSTANT; - _constant_border_value = PixelValue(); - } - -#ifdef ENABLE_NCHW_KERNELS - // Configure scale function to run - if(_data_layout == DataLayout::NCHW) - { - std::string function_to_call("scale_"); - function_to_call += string_from_data_type(src->data_type()) + "_"; - function_to_call += string_from_data_layout(_data_layout) + "_"; - function_to_call += string_from_interpolation_policy(_policy); - - static std::map map_function = - { - { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 }, - - { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - }; - auto it = map_function.find(function_to_call); - if(it != map_function.end()) - { - _func = it->second; - } - } -#endif // ENABLE_NCHW_KERNELS - - // Configure window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -#ifdef ENABLE_NCHW_KERNELS -template -void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy); - const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Set offsets window - Window win_off; - win_off.set(Window::DimX, window[Window::DimX]); - win_off.set(Window::DimY, window[Window::DimY]); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - // Create iterators - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); - const auto in_yi = static_cast(_align_corners ? 
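The AREA-to-nearest-neighbour fallback above hinges on the resize ratios wr and hr, which express how many source steps correspond to one destination step; ratios of at most 1 in both dimensions mean up-sampling, where area averaging degenerates to picking a single pixel. The sketch below shows the ratio computation; the (in - 1)/(out - 1) form for align_corners follows the usual convention and is an assumption, not taken from the library source.

#include <cstdio>

static float calculate_resize_ratio(unsigned int in, unsigned int out, bool align_corners)
{
    return align_corners ? static_cast<float>(in - 1) / static_cast<float>(out - 1)
                         : static_cast<float>(in) / static_cast<float>(out);
}

int main()
{
    const float wr = calculate_resize_ratio(32, 64, false); // 0.50 source steps per destination step
    const float hr = calculate_resize_ratio(32, 48, false); // ~0.67
    const bool  area_as_nearest = (wr <= 1.f && hr <= 1.f); // up-sampling in both axes
    std::printf("wr=%.2f hr=%.2f area->nearest=%d\n", wr, hr, area_as_nearest);
}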
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( - id.y() + _sampling_offset) - * hr)); - const int32_t offset_row = in_yi * in_stride_x; - *reinterpret_cast(dst_i.ptr()) = *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); - }, - src_i, offsets_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - Window win_off; - win_off.set(Window::DimX, window.x()); - win_off.set(Window::DimY, window.y()); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - Iterator dx_i(dx, win_off); - Iterator dy_i(dy, win_off); - - const int32_t in_dim_w = src->info()->dimension(0); - const int32_t in_dim_h = src->info()->dimension(1); - const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; - - if(_border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h - && index_h < in_dim_h - 1) ? 
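For the nearest-neighbour NCHW path above, the horizontal source position comes from the precomputed offsets tensor while the vertical one is derived on the fly from the destination row as (y + sampling_offset) * hr, rounded half away from zero when align_corners is set and floored otherwise. A scalar sketch of that row computation, with a hypothetical round_half_away_from_zero helper:

#include <cmath>
#include <cstdio>

static int round_half_away_from_zero(float v)
{
    return static_cast<int>(std::round(v)); // std::round already rounds halves away from zero
}

static int source_row(int dst_y, float hr, float sampling_offset, bool align_corners)
{
    const float pos = (dst_y + sampling_offset) * hr;
    return align_corners ? round_half_away_from_zero(pos) : static_cast<int>(std::floor(pos));
}

int main()
{
    // Destination row 2, ratio 2.2, CENTER sampling (offset 0.5): position 5.5
    // floors to source row 5 but rounds away from zero to row 6.
    std::printf("%d\n", source_row(2, 2.2f, 0.5f, /*align_corners=*/false)); // 5
    std::printf("%d\n", source_row(2, 2.2f, 0.5f, /*align_corners=*/true));  // 6
}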
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : - const_border_value; - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); - const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); - const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); - const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy, offsets); - using namespace scale_helpers; - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); - - // Don't increment in width/height/channels for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - const auto w = src->info()->dimension(0); - const auto h = src->info()->dimension(1); - const size_t in_stride = src->info()->strides_in_bytes()[1]; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(src_i.ptr()); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); - tmp0 = 
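Both bilinear branches above end in scale_helpers::delta_bilinear(a00, a01, a10, a11, dx, dy); the weighting it applies is the textbook bilinear formula (assumed here to match the library helper), and the REPLICATE branch simply clamps the four neighbour coordinates before reading. A scalar sketch:

#include <algorithm>
#include <cstdio>

static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    // Weighted average of the 2x2 neighbourhood; dx/dy are the fractional distances to the top-left sample.
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}

int main()
{
    // REPLICATE border handling: clamp the neighbour coordinates into [0, dim - 1] before reading.
    const int in_dim_w = 4;
    const int index_w  = 3;                                   // right-most column
    const int x0 = std::clamp(index_w, 0, in_dim_w - 1);      // 3
    const int x1 = std::clamp(index_w + 1, 0, in_dim_w - 1);  // also 3: the edge pixel is replicated
    std::printf("x0=%d x1=%d\n", x0, x1);
    std::printf("%.2f\n", delta_bilinear(10.f, 20.f, 30.f, 40.f, 0.25f, 0.5f)); // 22.50
}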
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); - - vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); - }, - src_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Get data layout and width/height indices - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(_border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = 
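The U8 area path above fills two uint8x8_t registers one destination pixel at a time; each pixel_area_c1u8_clamp() call averages the block of source pixels covered by that destination pixel, clamped at the borders. The scalar sketch below illustrates the averaging under the simplifying assumption of an integer down-scale factor and no border handling.

#include <cstdint>
#include <cstdio>

// Average the wr x hr source block that maps onto destination pixel (dst_x, dst_y).
static uint8_t area_average(const uint8_t *src, int src_w, int dst_x, int dst_y, int wr, int hr)
{
    int sum = 0;
    for(int y = 0; y < hr; ++y)
    {
        for(int x = 0; x < wr; ++x)
        {
            sum += src[(dst_y * hr + y) * src_w + (dst_x * wr + x)];
        }
    }
    return static_cast<uint8_t>(sum / (wr * hr));
}

int main()
{
    const uint8_t src[4 * 4] =
    {
        10,  20,  30,  40,
        50,  60,  70,  80,
        90,  100, 110, 120,
        130, 140, 150, 160,
    };
    // 4x4 -> 2x2 down-scale: destination (0, 0) averages the top-left 2x2 block.
    std::printf("%d\n", area_average(src, 4, 0, 0, 2, 2)); // (10 + 20 + 50 + 60) / 4 = 35
}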
*(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -#endif // ENABLE_NCHW_KERNELS - -Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); - return Status{}; -} - -void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, 
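The QASYMM8 bilinear path above works in three steps: dequantize the four neighbours with the input quantization info, interpolate in float, then requantize with the output quantization info. A scalar sketch of that round trip, using the standard uniform affine scheme q = round(x / scale) + offset (illustrative, not the library's Qasymm8QuantizationHelper):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct UniformQInfo { float scale; int32_t offset; };

static float dequantize(uint8_t q, const UniformQInfo &qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

static uint8_t quantize(float x, const UniformQInfo &qi)
{
    const int q = static_cast<int>(std::lround(x / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy) + a10 * (1.f - dx) * dy + a11 * dx * dy;
}

int main()
{
    const UniformQInfo iq{ 0.5f, 10 };  // input quantization
    const UniformQInfo oq{ 0.25f, 0 };  // output quantization
    const uint8_t a00 = 12, a01 = 14, a10 = 16, a11 = 18;

    // Dequantize -> interpolate in float -> requantize for the output tensor.
    const float f = delta_bilinear(dequantize(a00, iq), dequantize(a01, iq),
                                   dequantize(a10, iq), dequantize(a11, iq), 0.5f, 0.5f);
    std::printf("%u\n", static_cast<unsigned>(quantize(f, oq))); // 10
}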
const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0); - const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); - const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); - - if(_data_layout == DataLayout::NCHW) - { - (this->*_func)(src, dst, dx, dy, offsets, window); - } - else - { - _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); - } -} - -const char *CpuScaleKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h deleted file mode 100644 index a2b65370ba..0000000000 --- a/src/core/cpu/kernels/CpuScaleKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H -#define ARM_COMPUTE_CPU_SCALEKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */ -class CpuScaleKernel : public ICpuKernel -{ -public: - CpuScaleKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel); - /** Initialise the kernel's inputs, output and interpolation policy - * - * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor - * @note Using @p policy Area only supports data layout NCHW and input data type U8. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. - * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 - * @param[in] dy Distance y tensor info. 
Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 - * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. - * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo to use for configuration - */ - void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, - const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, - const ScaleKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: -#ifdef ENABLE_NCHW_KERNELS - /** function to perform scale using area interpolation on the given window - * - * @note Used only in case down-sampling. - */ - void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using nearest neighbour on the given window */ - template - void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); -#endif // ENABLE_NCHW_KERNELS - - /** Scale function to use for the particular function to use */ - using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); - using ScaleKernelPtr = std::add_pointer::type; - - ScaleFunctionPtr _func{ nullptr }; - InterpolationPolicy _policy{}; - BorderMode _border_mode{}; - PixelValue _constant_border_value{}; - float _sampling_offset{ 0 }; - bool _align_corners{ false }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - ScaleKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SCALEKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp deleted file mode 100644 index c562699092..0000000000 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuSoftmaxKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/softmax/impl/neon/list.h" -#include "src/core/cpu/kernels/softmax/impl/sve/list.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct SoftmaxSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using SoftmaxSelectorPtr = std::add_pointer::type; -using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; -using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -struct SoftmaxLogits1DKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; -}; - -struct SoftmaxLogits1DMaxKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; -}; - -static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) - }, - { - "sve_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ - -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_softmax_logits_1d", - [](const 
SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ - { - "neon_qu8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) - }, - { - "neon_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) - }, -}; - -static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max) - }, - { - "neon_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -}; - -const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - -const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_max_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, 
DataType::F16, DataType::F32); - - // Validate in case of configured output - if(output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} - -} // namespace - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); -} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, - const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) -{ - ARM_COMPUTE_UNUSED(beta); - // Check input - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - - // Check output if configured - if(dst.total_size() != 0) - { - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); - } - - // Check tmp if configured - if(tmp.total_size() != 0) - { - const DataType tmp_data_type = is_quantized_asymmetric ? 
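The split between CpuLogits1DMaxKernel and the softmax kernel below follows the usual numerically stable formulation: compute the row maximum first, subtract it before exponentiation, then normalise the exponentials (or, in the log-softmax case, subtract the log of their sum). A scalar reference for both variants, for illustration only:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Numerically stable softmax / log-softmax over one row, mirroring the max-then-normalise split.
static std::vector<float> softmax_1d(const std::vector<float> &x, float beta, bool is_log)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // CpuLogits1DMaxKernel's job

    std::vector<float> out(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        out[i] = (x[i] - max_val) * beta; // shifted logits: exp() can no longer overflow
        sum += std::exp(out[i]);
    }
    for(float &v : out)
    {
        v = is_log ? v - std::log(sum) : std::exp(v) / sum;
    }
    return out;
}

int main()
{
    const std::vector<float> logits{ 1.f, 2.f, 3.f };
    for(float v : softmax_1d(logits, /*beta=*/1.f, /*is_log=*/false)) { std::printf("%.4f ", v); }
    std::printf("\n"); // ~0.0900 0.2447 0.6652
}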
DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); - // We could potentially reduce tmp memory if we could predict or make an assumption - // on the maximum number of threads that will run in parallel. - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); - } - - return Status{}; -} -} // namespace - -template -void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - - // Configure kernel window - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); - - // Output auto initialization if not yet initialized - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); - auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - - const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); - - _beta = beta; - _run_method = uk->ukernel; - _name = kernel_name.append("/").append(uk->name); - - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); - - ICpuKernel::configure(win); -} - -template -Status CpuLogits1DSoftmaxKernel::validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - - return Status{}; -} - -template -void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); -} - -template -const char *CpuLogits1DSoftmaxKernel::name() const -{ - return _name.c_str(); -} - -template class CpuLogits1DSoftmaxKernel; -template class CpuLogits1DSoftmaxKernel; - -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h 
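The run_op() above slices the shared tmp tensor into one contiguous region per thread, tmp_for_thread = tmp->buffer() + thread_id * tmp_size_for_thread, after asserting the buffer can hold info.num_threads such slices. A stripped-down sketch of that partitioning; ThreadInfo and the sizes are simplified stand-ins.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct ThreadInfo { int thread_id; int num_threads; };

// Return this thread's private slice of a shared scratch buffer.
static uint8_t *thread_scratch(std::vector<uint8_t> &tmp, std::size_t bytes_per_thread, const ThreadInfo &info)
{
    assert(tmp.size() >= bytes_per_thread * static_cast<std::size_t>(info.num_threads));
    return tmp.data() + static_cast<std::size_t>(info.thread_id) * bytes_per_thread;
}

int main()
{
    const std::size_t row_elems        = 128;                 // elements processed per iteration
    const std::size_t bytes_per_thread = row_elems * sizeof(float);
    std::vector<uint8_t> tmp(bytes_per_thread * 4);           // scratch shared by 4 workers

    for(int t = 0; t < 4; ++t)
    {
        uint8_t *slice = thread_scratch(tmp, bytes_per_thread, ThreadInfo{ t, 4 });
        std::printf("thread %d -> offset %zu\n", t, static_cast<std::size_t>(slice - tmp.data()));
    }
}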
b/src/core/cpu/kernels/CpuSoftmaxKernel.h deleted file mode 100644 index 776c0d6f79..0000000000 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel -{ -public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. */ -template -class CpuLogits1DSoftmaxKernel : public ICpuKernel -{ -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); - - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. - * - * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
- */ - void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DSoftmaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -private: - float _beta{ 1.0f }; - SoftmaxLogits1DKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp deleted file mode 100644 index fa7a55805e..0000000000 --- a/src/core/cpu/kernels/CpuSubKernel.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuSubKernel.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/sub/neon/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct SubSelectorData -{ - DataType dt; -}; - -using SubSelectorPtr = std::add_pointer::type; -using SubKernelPtr = std::add_pointer::type; - -struct SubKernel -{ - const char *name; - const SubSelectorPtr is_selected; - SubKernelPtr ukernel; -}; - -static const SubKernel available_kernels[] = -{ - { - "neon_fp32_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_u8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s32_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_qu8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) - }, - { - "neon_qs8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) - }, - { - "neon_qs16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) - }, -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const SubKernel *get_implementation(DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt })) - { - return &uk; - } - } - return nullptr; -} - -inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const auto *uk = get_implementation(src0.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP), - "Convert policy cannot be WRAP if datatype is quantized"); - - // Validate in case of 
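CpuSubKernel, like the other kernels in this move, selects its micro-kernel from a static table: each entry pairs a name with a selector predicate and a function pointer, and get_implementation returns the first entry whose predicate accepts the request. A stripped-down sketch of the same dispatch pattern, with simplified types and invented entries:

    #include <cstdio>

    enum class DataType { F32, U8 };
    struct SelectorData { DataType dt; };

    struct Kernel
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)();
    };

    void run_fp32() { std::puts("fp32 path"); }
    void run_u8()   { std::puts("u8 path"); }

    static const Kernel available_kernels[] =
    {
        { "neon_fp32_sub", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32 },
        { "neon_u8_sub",   [](const SelectorData &d) { return d.dt == DataType::U8; },  run_u8 },
    };

    const Kernel *get_implementation(DataType dt)
    {
        for(const auto &uk : available_kernels)
        {
            if(uk.is_selected({ dt }))
            {
                return &uk;
            }
        }
        return nullptr;
    }

    int main()
    {
        if(const Kernel *uk = get_implementation(DataType::U8))
        {
            uk->ukernel(); // prints "u8 path"
        }
        return 0;
    }

Keeping the selector separate from the function pointer is what lets validate() reject unsupported data types before configure() ever binds a micro-kernel.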
configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - return Status{}; -} -} // namespace - -void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(*dst, out_shape); - set_data_type_if_unknown(*dst, src0->data_type()); - - const auto *uk = get_implementation(src0->data_type()); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _policy = policy; - _run_method = uk->ukernel; - _name = std::string("CpuSubKernel").append("/").append(uk->name); - - // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped - Window win = calculate_max_window(out_shape, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); - - return Status{}; -} - -void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, _policy, window); -} - -const char *CpuSubKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h deleted file mode 100644 index cb64e64cfa..0000000000 --- a/src/core/cpu/kernels/CpuSubKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
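configure() above derives the destination shape with TensorShape::broadcast_shape, and validation reports "Inputs are not broadcast compatible" when that shape comes out empty. Sketched in NumPy-style terms (the library's exact rules may differ in detail), the per-dimension check looks like this:

    #include <algorithm>
    #include <vector>

    // Illustrative: extents must match, or one of them must be 1; an empty result
    // plays the role of the "not broadcast compatible" error above.
    std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b)
    {
        const size_t n = std::max(a.size(), b.size());
        a.resize(n, 1);
        b.resize(n, 1);
        std::vector<size_t> out(n);
        for(size_t i = 0; i < n; ++i)
        {
            if(a[i] != b[i] && a[i] != 1 && b[i] != 1)
            {
                return {};
            }
            out[i] = std::max(a[i], b[i]);
        }
        return out;
    }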
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H -#define ARM_COMPUTE_CPU_SUB_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform subtraction between two tensors */ -class CpuSubKernel : public ICpuKernel -{ -public: - CpuSubKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel); - - /** Initialise the kernel's src and dst. - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. - * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuSubKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SubKernelPtr = std::add_pointer::type; - -private: - ConvertPolicy _policy{}; - SubKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuTransposeKernel.cpp b/src/core/cpu/kernels/CpuTransposeKernel.cpp deleted file mode 100644 index c7cafe94a8..0000000000 --- a/src/core/cpu/kernels/CpuTransposeKernel.cpp +++ /dev/null @@ -1,510 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
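The interface above notes that ConvertPolicy::WRAP is rejected for quantized inputs. The reason is easy to see with raw 8-bit storage: a wrapped result lands on an unrelated quantized level, while saturation clamps to the nearest representable one. A two-line illustration (plain C++, not library code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int a = 10, b = 20;                                            // QASYMM8 stores levels as uint8_t
        const uint8_t wrapped   = static_cast<uint8_t>(a - b);               // 246: an unrelated level after wrap-around
        const uint8_t saturated = static_cast<uint8_t>(std::max(a - b, 0));  // 0: clamped to the valid range
        std::printf("wrap=%u saturate=%u\n", wrapped, saturated);
        return 0;
    }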
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuTransposeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -unsigned int num_elems_processed(size_t element_size) -{ - switch(element_size) - { - case 1: - return 8; - case 2: - case 4: - return 4; - default: - break; - } - - ARM_COMPUTE_ERROR("Element size not supported"); -} - -void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 8; - const int window_step_y = 8; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x8_t row0 = vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); - const uint8x8_t row1 = vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); - const uint8x8_t row2 = vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); - const uint8x8_t row3 = vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); - const uint8x8_t row4 = vld1_u8(reinterpret_cast(input.ptr() + x + 4 * input_stride_in_bytes)); - const uint8x8_t row5 = vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); - const uint8x8_t row6 = vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); - const uint8x8_t row7 = vld1_u8(reinterpret_cast(input.ptr() + x + 7 * 
input_stride_in_bytes)); - - // Transpose 2x2 - const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); - const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); - const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); - const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); - - // Transpose 4x4 - const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); - const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); - const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); - const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); - - // Transpose 8x8 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); - const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); - const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); - } - - // Compute left-over elements along the x dimension (1x8) - for(; x < window_end_x; ++x) - { - const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); - const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); - const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); - const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); - const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); - const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); - const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); - const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); - - uint8x8_t result = vdup_n_u8(0); - result = vset_lane_u8(val0, result, 0); - result = vset_lane_u8(val1, result, 1); - result = vset_lane_u8(val2, result, 2); - result = vset_lane_u8(val3, result, 3); - result = vset_lane_u8(val4, result, 4); - result = vset_lane_u8(val5, result, 5); - result = vset_lane_u8(val6, result, 6); - result = vset_lane_u8(val7, 
result, 7); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(output.ptr() + dst_offset_in_bytes, result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint8_t val0 = *input.ptr(); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; - - *(output.ptr() + dst_offset_in_bytes) = val0; - }, - input, output); - } -} - -void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x4_t row0 = vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16x4_t row1 = vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16x4_t row2 = vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16x4_t row3 = vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); - const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); - - // Transpose 4x4 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - 
vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint16x4_t result = vdup_n_u16(0); - result = vset_lane_u16(val0, result, 0); - result = vset_lane_u16(val1, result, 1); - result = vset_lane_u16(val2, result, 2); - result = vset_lane_u16(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr())); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; - - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); - } -} - -void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only 
if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint32x4_t row0 = vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4_t row1 = vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4_t row2 = vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4_t row3 = vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); - const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); - const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); - const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Swap block 01 with block 10 and store - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint32x4_t result = vdupq_n_u32(0); - result = vsetq_lane_u32(val0, result, 0); - result = vsetq_lane_u32(val1, result, 1); - result = vsetq_lane_u32(val2, result, 2); - result = vsetq_lane_u32(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); - } -} -} // namespace - -void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Destination auto inizialitation 
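The three transpose_*bit_elements helpers above share one structure: a SIMD main loop that transposes square blocks (8x8 bytes, or 4x4 halfwords/words) by combining vtrn lane interleaves at growing granularity, a scalar tail for leftover columns along x, and a second scalar pass for leftover rows along y; in the 32-bit variant the 4x4 result is assembled from 2x2 sub-blocks with the two off-diagonal blocks swapped before the stores ("Swap block 01 with block 10"). A plain scalar sketch of that blocking scheme, with the block size as a parameter and no intrinsics:

    #include <cstddef>

    // Illustrative: transpose an h x w row-major matrix in b x b blocks, then handle
    // leftover columns and rows separately, mirroring the kernel's loop structure.
    template <typename T>
    void transpose_blocked(const T *in, T *out, size_t h, size_t w, size_t b)
    {
        const size_t h_blocks = (h / b) * b;
        const size_t w_blocks = (w / b) * b;

        // Main loop: full b x b blocks (the SIMD path).
        for(size_t y = 0; y < h_blocks; y += b)
            for(size_t x = 0; x < w_blocks; x += b)
                for(size_t by = 0; by < b; ++by)
                    for(size_t bx = 0; bx < b; ++bx)
                        out[(x + bx) * h + (y + by)] = in[(y + by) * w + (x + bx)];

        // Leftover columns along x (the scalar 1xN tail).
        for(size_t y = 0; y < h_blocks; ++y)
            for(size_t x = w_blocks; x < w; ++x)
                out[x * h + y] = in[y * w + x];

        // Leftover rows along y (the scalar 1x1 tail).
        for(size_t y = h_blocks; y < h; ++y)
            for(size_t x = 0; x < w; ++x)
                out[x * h + y] = in[y * w + x];
    }

Splitting the tails out is what lets the kernel run without padding: the vector loop never reads or writes past the last full block.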
if not yet initialized - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); - - // Note: This kernel performs 16 elements per iteration. - // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration_x is set to 1 - const unsigned int num_elems_processed_per_iteration_x = 1; - const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(dst->num_dimensions()); - dst->set_valid_region(ValidRegion(coord, dst->tensor_shape())); - - ICpuKernel::configure(win); -} - -Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - // Error if input is not 8 bit, 16bit or 32bit - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4, - "Element size not supported"); - - // Validate configured destination - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->element_size()) - { - case 1: - transpose_8bit_elements(src, dst, window); - break; - case 2: - transpose_16bit_elements(src, dst, window); - break; - case 4: - transpose_32bit_elements(src, dst, window); - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } -} - -const char *CpuTransposeKernel::name() const -{ - return "CpuTransposeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuTransposeKernel.h b/src/core/cpu/kernels/CpuTransposeKernel.h deleted file mode 100644 index 920349d5e7..0000000000 --- a/src/core/cpu/kernels/CpuTransposeKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel which transposes the elements of a matrix */ -class CpuTransposeKernel : public ICpuKernel -{ -public: - CpuTransposeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Srouce tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuTransposeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp deleted file mode 100644 index 79f058944d..0000000000 --- a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) -{ - TensorShape output_shape{ src->tensor_shape() }; - - output_shape.collapse(3); - const size_t tmp_dim = output_shape[0]; - output_shape.set(0, output_shape[1]); - output_shape.set(1, tmp_dim + (has_bias ? 1 : 0)); - - return output_shape; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); - } - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - biases, - dst)); - - // Configure kernel - Window window = calculate_max_window(*src, Steps()); - window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0))); - window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1))); - window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2))); - ICpuKernel::configure(window); -} - -Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst)); - return Status{}; -} - -void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - 
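get_output_shape above collapses the first three weight dimensions, swaps the two that remain, and adds one extra row when a bias is appended. The arithmetic, as a small self-checking sketch (hypothetical helper, plain std types):

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Illustrative: weights are [kernel_x, kernel_y, IFM, OFM]; the reshaped matrix is
    // [OFM, kernel_x * kernel_y * IFM (+ 1 when a bias row is appended)].
    std::array<size_t, 2> reshaped_weights_shape(size_t kx, size_t ky, size_t ifm, size_t ofm, bool has_bias)
    {
        return { ofm, kx * ky * ifm + (has_bias ? 1u : 0u) };
    }

    int main()
    {
        // 3x3 kernel, 2 input channels, 4 output channels, bias appended:
        // x-dimension 4 (one column per OFM), y-dimension 18 linearised elements + 1 bias row.
        const auto shape = reshaped_weights_shape(3, 3, 2, 4, true);
        assert(shape[0] == 4 && shape[1] == 19);
        return 0;
    }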
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const unsigned int kernel_size_x = src->info()->dimension(0); - const unsigned int kernel_size_y = src->info()->dimension(1); - const unsigned int kernel_depth = src->info()->dimension(2); - const unsigned int input_stride_x = src->info()->strides_in_bytes().x(); - const unsigned int input_stride_y = src->info()->strides_in_bytes().y(); - const unsigned int input_stride_z = src->info()->strides_in_bytes().z(); - const unsigned int output_stride_y = dst->info()->strides_in_bytes().y(); - - // Create iterators - Iterator in(src, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Get column index - const int kernel_idx = id[3]; - const int kernel_idz = id[4]; - - // Setup pointers - const uint8_t *tmp_input_ptr = in.ptr(); - uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); - const uint8_t *curr_input_row_ptr = tmp_input_ptr; - const uint8_t *curr_input_depth_ptr = tmp_input_ptr; - - // Linearize volume - for(unsigned int d = 0; d < kernel_depth; ++d) - { - for(unsigned int j = 0; j < kernel_size_y; ++j) - { - for(unsigned int i = 0; i < kernel_size_x; ++i) - { - std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); - tmp_input_ptr += input_stride_x; - tmp_output_ptr += output_stride_y; - } - curr_input_row_ptr += input_stride_y; - tmp_input_ptr = curr_input_row_ptr; - } - curr_input_depth_ptr += input_stride_z; - curr_input_row_ptr = curr_input_depth_ptr; - tmp_input_ptr = curr_input_depth_ptr; - } - - // Add bias - if(biases != nullptr) - { - std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); - } - }, - in); -} -const char *CpuWeightsReshapeKernel::name() const -{ - return "CpuWeightsReshapeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.h b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h deleted file mode 100644 index eea150a96e..0000000000 --- a/src/core/cpu/kernels/CpuWeightsReshapeKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H -#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform reshaping on the weights used by convolution and locally connected layer - * - * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. - * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication. - * - * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: - * @f[ - * \left( \begin{array}{ccc} - * a000 & a001 & a002 \\ - * a010 & a011 & a012 \\ - * a020 & a021 & a022 \\ - * \end{array} \right) - * \left( \begin{array}{ccc} - * a100 & a101 & a102 \\ - * a110 & a111 & a112 \\ - * a120 & a121 & a122 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ - * \end{array} \right) - * @f] - */ -class CpuWeightsReshapeKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuWeightsReshapeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. - * Data types supported: All - * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] dst The output tensor info. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuWeightsReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp deleted file mode 100644 index 9456f96354..0000000000 --- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
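The header documentation above shows a depth-2, 3x3 kernel flattened into a single 18-element row, and run_op implements exactly that with a depth/row/column loop over strided pointers (plus one memcpy for the bias). A simplified contiguous-memory sketch of the linearisation, illustrative indexing only; the real kernel writes each flattened kernel down a column of the output in output_stride_y steps:

    #include <cstddef>

    // Illustrative: flatten one [kx, ky, depth] kernel volume (contiguous, x fastest)
    // into a single row, optionally appending its bias value at the end.
    void linearize_kernel(const float *kernel, size_t kx, size_t ky, size_t depth,
                          float *row, const float *bias)
    {
        size_t o = 0;
        for(size_t d = 0; d < depth; ++d)
            for(size_t y = 0; y < ky; ++y)
                for(size_t x = 0; x < kx; ++x)
                    row[o++] = kernel[(d * ky + y) * kx + x];
        if(bias != nullptr)
        {
            row[o] = *bias; // the extra element that the +1 in the output shape accounts for
        }
    }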
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -//Batched Gemms - -namespace -{ -inline bool is_kernel_size_supported(DataType data_type, Size2D size) -{ - const std::array f32_support = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } }; - const std::array f16_support = { { Size2D(3, 3) } }; - - switch(data_type) - { - case DataType::F16: - return std::end(f16_support) != std::find(std::begin(f16_support), std::end(f16_support), size); - case DataType::F32: - return std::end(f32_support) != std::find(std::begin(f32_support), std::end(f32_support), size); - default: - return false; - } -} - -Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const auto input_width = input->dimension(idx_width); - const auto input_height = input->dimension(idx_height); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(input_width, input_height)), - "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - const Size2D &output_tile = winograd_info.output_tile_size; - const std::array supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } }; - 
ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile)); - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info))); - const Window win = calculate_max_window(*input, Steps(), true /* skip border*/); - return std::make_pair(Status{}, win); -} - -Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const Size2D &kernel_dims = winograd_info.kernel_size; - const PadStrideInfo &conv_info = winograd_info.convolution_info; - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - - // Validate configured output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); - return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); -} - -Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const PadStrideInfo &conv_info = winograd_info.convolution_info; - const Size2D kernel_dims = winograd_info.kernel_size; - - // Number of tiles along the X and Y direction - const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast - (winograd_info.output_tile_size.width)); - const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast - (winograd_info.output_tile_size.height)); - const Size2D num_tiles = 
Size2D(num_tiles_x, num_tiles_y); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - - const std::array supported_gemm_sizes = { { 8U, 16U, 36U } }; - ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2))); - ARM_COMPUTE_UNUSED(kernel_dims); - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1)); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - return Status{}; -} - -std::pair validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info))); - - return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); -} -} // namespace - -Status ICpuWinogradConv2dTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *weights) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - const DataLayout data_layout = input->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(weights->dimension(width_idx), weights->dimension(height_idx))), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - return Status{}; -} - -template -unsigned int CpuWinogradConv2dTransformWeightsKernel::get_weight_storage_size(int num_output_channels, int num_input_channels) const -{ - const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels); - // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T - return static_cast(WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T)); -} - -template -CpuWinogradConv2dTransformWeightsKernel::CpuWinogradConv2dTransformWeightsKernel() - : _transform(nullptr), _num_output_channels(0), _matrix_stride(0) -{ -} - -template -int CpuWinogradConv2dTransformWeightsKernel::get_matrix_stride(int num_output_channels, int num_input_channels) const -{ - return 
WinogradConv::get_kernel_matrix_stride(num_input_channels, num_output_channels); -} - -#ifndef DOXYGEN_SKIP_THIS -template -void CpuWinogradConv2dTransformWeightsKernel::configure( - const ITensorInfo *weights_hwio, - ITensorInfo *output, - const int matrix_stride, /** Stride across matrices in the output. */ - const int num_output_channels, /** Number of filters. */ - const int num_input_channels) /** Number of channels in each filter. */ -{ - ARM_COMPUTE_UNUSED(weights_hwio, output); - - _transform = std::make_unique(num_output_channels, num_input_channels); - _num_output_channels = num_output_channels; - _matrix_stride = matrix_stride; - - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - ICpuKernel::configure(win); -} -#endif /* DOXYGEN_SKIP_THIS */ - -template -void CpuWinogradConv2dTransformWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - - const ITensor *weights_hwio = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *output = tensors.get_tensor(TensorType::ACL_DST); - - _transform->set_weight_tensor(weights_hwio->buffer()); - const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK); - _transform->set_output_matrices(output->buffer(), _matrix_stride, matrix_row_stride); - _transform->set_working_space(output->buffer()); - - _transform->run(fst, lst); -} - -template -bool CpuWinogradConv2dTransformWeightsKernel::is_parallelisable() const -{ - return false; -} - -template -Status CpuWinogradConv2dTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first); - return Status{}; -} - -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; - -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -// Input transform - -template -unsigned int CpuWinogradConv2dTransformInputKernel::get_input_storage_size( - int num_batches, /* Number of batches in the input tensor. */ - int num_channels, /* Number of feature maps in the input tensor. */ - int num_rows, /* Number of rows in each feature map. */ - int num_cols, /* Number of columns in each feature map. */ - bool same_padding /* Use "SAME" padding, otherwise use "VALID". */ -) const -{ - // Construct shapes for the input and kernel tensors. 
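// Only a size query is performed here; no tensors are allocated. With "SAME" padding the spatial extent of the output is preserved, so more tiles (and therefore more storage) are needed than with "VALID" padding for the same input size.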
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels); - const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels); - // Return the size, converted into units of TIn - return static_cast(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T)); -} - -template -unsigned int CpuWinogradConv2dTransformInputKernel::get_working_space_size(unsigned int num_threads) const -{ - return _transform->get_working_space_size(num_threads); -} - -template -int CpuWinogradConv2dTransformInputKernel::get_matrix_stride( - int num_batches, /* Number of batches in the input tensor. */ - int num_channels, /* Number of feature maps in the input tensor. */ - int num_rows, /* Number of rows in each feature map. */ - int num_cols, /* Number of columns in each feature map. */ - bool same_padding /* Use "SAME" padding, otherwise use "VALID". */) const -{ - return WinogradConv::get_input_matrix_stride(num_batches, num_rows, num_cols, num_channels, same_padding); -} - -template -CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel() - : _transform(nullptr), _num_channels(0), _matrix_stride(0) -{ -} - -template -void CpuWinogradConv2dTransformInputKernel::configure( - const ITensorInfo *input_nhwc, - const int num_batches, /* Number of batches in input tensor. */ - const int num_rows, /* Number of rows in input tensor. */ - const int num_cols, /* Number of columns in input tensor. */ - const int num_channels, /* Number of channels in input tensor. */ - const PaddingType padding, /* Padding type. */ - ITensorInfo *output, /* Base of output matrices. */ - const int matrix_stride, /* Stride between output matrices. */ - ITensorInfo *workspace) -{ - ARM_COMPUTE_UNUSED(input_nhwc, output, matrix_stride, workspace); - - _num_channels = num_channels; - _matrix_stride = matrix_stride; - - const int padding_top = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0; - const int padding_left = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0; - const int padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0; - const int padding_right = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0; - - _transform = std::make_unique( - KernelRows, - KernelCols, - num_batches, - num_rows, - num_cols, - num_channels, - padding_top, /**< Padding to apply to the top of the image. */ - padding_left, /**< Padding to apply to the left of the image. */ - padding_bottom, /**< Padding to apply to the bottom of the image. */ - padding_right /**< Padding to apply to the right of the image. 
*/ - ); - - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - ICpuKernel::configure(win); -} - -template -void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *input_nhwc = tensors.get_const_tensor(TensorType::ACL_SRC); - const ITensor *workspace = tensors.get_const_tensor(TensorType::ACL_INT); - ITensor *output = tensors.get_tensor(TensorType::ACL_DST); - - const int element_size_in_bytes = input_nhwc->info()->element_size(); - const int input_col_stride = input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes; - const int input_row_stride = input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes; - const int input_batch_stride = input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes; - const auto input_nhwc_ptr = reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); - auto output_ptr = reinterpret_cast(output->buffer() + output->info()->offset_first_element_in_bytes()); - ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr); - - _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride); - _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels); - - _transform->set_working_space(workspace->buffer()); - - // The code below cannot be moved to configure because biases hasn't been allocated at that point - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - _transform->run(fst, lst, info.thread_id); -} - -template -Status CpuWinogradConv2dTransformInputKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first); - - return Status{}; -} - -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; - -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -// Output transform - -template -unsigned int CpuWinogradConv2dTransformOutputKernel::get_output_storage_size( - int num_batches, /* Number of batches in the output tensor. */ - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - int num_output_channels /* Number of feature maps in the output tensor. */ -) const -{ - // Construct shapes for the input and kernel tensors. 
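// As for the input transform, the shapes below only drive a size query. The returned storage is understood to cover the Winograd-domain output of the batched GEMMs, which the inverse transform later maps back to spatial-domain values.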
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1); - const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1); - // Return the size, converted into units of TOut - return static_cast( - WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T)); -} - -template -CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel() - : _transform(nullptr), _matrix_stride(0), _matrix_row_stride(0) -{ -} - -template -unsigned int CpuWinogradConv2dTransformOutputKernel::get_working_space_size(unsigned int num_threads) const -{ - return _transform->get_working_space_size(num_threads); -} - -template -int CpuWinogradConv2dTransformOutputKernel::get_matrix_stride( - int num_batches, /* Number of batches in the output tensor. */ - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - int num_output_channels /* Number of feature maps in the output tensor. */ -) const -{ - return WinogradConv::get_output_matrix_stride(num_batches, num_rows, num_cols, num_output_channels); -} - -template -std::pair CpuWinogradConv2dTransformOutputKernel::get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same) const -{ - return WinogradConv::get_output_shape(std::make_pair(num_rows, num_cols), padding_same); -} - -template -void CpuWinogradConv2dTransformOutputKernel::configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) -{ - ARM_COMPUTE_UNUSED(biases, transformed_output, output_nhwc, num_batches, num_rows, num_cols, workspace, activation); - - _matrix_stride = matrix_stride; - _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK); - - // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window - _transform = std::make_unique(num_batches, num_rows, num_cols, num_channels, activation); - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - - ICpuKernel::configure(win); -} - -template -void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *biases = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *transformed_output = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT); - ITensor *dst_nhwc = tensors.get_tensor(TensorType::ACL_DST); - - const int out_batch_stride = dst_nhwc->info()->strides_in_bytes()[3] / sizeof(T); - const int out_row_stride = dst_nhwc->info()->strides_in_bytes()[2] / sizeof(T); - const int out_col_stride = dst_nhwc->info()->strides_in_bytes()[1] / sizeof(T); - - _transform->set_input_matrices(transformed_output->buffer(), _matrix_stride, _matrix_row_stride); - _transform->set_bias((biases ? 
reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()) : nullptr)); - _transform->set_output_tensor(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride); - _transform->set_working_space(workspace->buffer()); - - // The code below cannot be moved to configure because biases hasn't been allocated at that point - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - _transform->run(fst, lst, info.thread_id); -} - -template -Status CpuWinogradConv2dTransformOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), output->clone().get(), winograd_info).first); - - return Status{}; -} - -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; - -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.h b/src/core/cpu/kernels/CpuWinogradConv2dKernel.h deleted file mode 100644 index b5a29ffd02..0000000000 --- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.h +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H -#define ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H - -#include "src/core/NEON/kernels/convolution/common/convolution.hpp" -#include "src/core/NEON/kernels/convolution/common/tensor.hpp" -#include "src/core/cpu/ICpuKernel.h" - -#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" - -namespace arm_compute -{ -namespace cpu -{ -/** Interface for the kernel to perform Winograd input transform. */ -class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel -{ -public: - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param num_threads The greatest number of threads that will be used to execute the transform. - * @return Size of working space required in bytes. - */ - virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Storage size (in units of TIn) required. - */ - virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; - - /** Configure the output transform kernel. - * - * @param[in] input_nhwc Input tensor in NHWC data layout format. - * @param[in] num_batches Number of batches in input tensor. - * @param[in] num_rows Number of rows in input tensor. - * @param[in] num_cols Number of columns in input tensor. - * @param[in] num_channels Number of channels in input tensor. - * @param[in] padding Padding type. - * @param[out] output Base of output matrices. - * @param[in] matrix_stride Stride between output matrices. - * @param[in] workspace Tensor to be used as the working space during the computation. - */ - virtual void configure(const ITensorInfo *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, - const PaddingType padding, ITensorInfo *output, const int matrix_stride, ITensorInfo *workspace) = 0; - - /** Destructor */ - virtual ~ICpuWinogradConv2dTransformInputKernel() - { - } -}; - -/** Kernel to perform Winograd input transform. 
*/ -template -class CpuWinogradConv2dTransformInputKernel : public ICpuWinogradConv2dTransformInputKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformInputKernel(const CpuWinogradConv2dTransformInputKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformInputKernel &operator=(const CpuWinogradConv2dTransformInputKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformInputKernel(CpuWinogradConv2dTransformInputKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformInputKernel() = default; - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Storage size (in units of TIn) required. - */ - unsigned int get_input_storage_size( - int num_batches, - int num_channels, - int num_rows, - int num_cols, - bool same_padding) const override; - - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - unsigned int get_working_space_size(unsigned int num_threads) const override; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Stride expressed in bytes. - */ - int get_matrix_stride( - int num_batches, - int num_channels, - int num_rows, - int num_cols, - bool same_padding) const override; - - /** Default constructor */ - CpuWinogradConv2dTransformInputKernel(); - - const char *name() const override - { - return "CpuWinogradConv2dTransformInputKernel"; - } - - /** Configure the output transform kernel. - * - * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported NHWC. - * @param[in] num_batches Number of batches in input tensor. - * @param[in] num_rows Number of rows in input tensor. - * @param[in] num_cols Number of columns in input tensor. - * @param[in] num_channels Number of channels in input tensor. - * @param[in] padding Padding type. - * @param[out] output Base of output matrices. - * @param[in] matrix_stride Stride between output matrices. - * @param[in] workspace Tensor to be used as the working space during the computation. 
- */ - void configure( - const ITensorInfo *input_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - const PaddingType padding, - ITensorInfo *output, - const int matrix_stride, - ITensorInfo *workspace) override; - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Winograd base kernel */ - using WinogradBase = winograd::WinogradGEMM; - /** Winograd convolution kernel */ - using WinogradConv = typename WinogradBase::template Convolution; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformInputKernel - * - * @param[in] input First tensor input info. Data types supported: F16/F32. - * @param[in] output Output tensor info. Data types supported: same as @p input. - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); - -private: - using InputTransform = typename WinogradBase::template InputTransform; - - std::unique_ptr _transform{ nullptr }; - int _num_channels; /**< Number of channels in input tensor. */ - int _matrix_stride; /**< Stride between output matrices. */ -}; - -/** Interface for the kernel to perform Winograd output transform. */ -class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel -{ -public: - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; - - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Storage size (in units of TOut) required. - */ - virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; - - /** Gets the stride between matrices in the output worspace - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; - - /** Get the output shape of a convolution. - * - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. 
- * @param[in] padding_same True if padding is SAME, false otherwise - * - * @return Shape of the output tensor - */ - virtual std::pair get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same /* True if padding is SAME, false otherwise */ - ) const = 0; - - /** Configure the output transform kernel. - * - * @param[in] biases Pointer to the biases tensor. - * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain. - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_rows Number of rows in output tensor. - * @param[in] num_cols Number of columns in output tensor. - * @param[in] num_channels Number of feature maps in the output tensor. - * @param[in] workspace Tensor to be used as the working space during the computation. - * @param[in] activation Activation to be used - */ - virtual void configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) = 0; - - virtual ~ICpuWinogradConv2dTransformOutputKernel() - { - } -}; - -/** Kernel to perform Winograd output transform. */ -template -class CpuWinogradConv2dTransformOutputKernel : public ICpuWinogradConv2dTransformOutputKernel -{ -public: - const char *name() const override - { - return "CpuWinogradConv2dTransformOutputKernel"; - } - /** Constructor */ - CpuWinogradConv2dTransformOutputKernel(); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformOutputKernel(const CpuWinogradConv2dTransformOutputKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformOutputKernel &operator=(const CpuWinogradConv2dTransformOutputKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformOutputKernel(CpuWinogradConv2dTransformOutputKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformOutputKernel() = default; - - // Inherited methods overridden: - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Storage size (in units of TOut) required. - */ - unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; - - /** Gets the stride between matrices in the output worspace - * - * @param[in] num_batches Number of batches in the output tensor. 
- * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Stride expressed in bytes. - */ - int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; - /** Get the output shape of a convolution. - * - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] padding_same True if padding is SAME, false otherwise - * - * @return Shape of the output tensor - */ - std::pair get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same) const override; - - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - unsigned int get_working_space_size(unsigned int num_threads) const override; - - /** Configure the output transform kernel. - * - * @param[in] biases Pointer to the biases tensor. - * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain. - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_rows Number of rows in output tensor. - * @param[in] num_cols Number of columns in output tensor. - * @param[in] num_channels Number of feature maps in the output tensor. - * @param[in] workspace Tensor to be used as the working space during the computation. - * @param[in] activation Activation to be used - */ - void configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) override; - - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformOutputKernel - * - * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32. - * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input - * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. 
Data type supported: same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info); - -private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using OutputTransform = typename WinogradBase::template OutputTransform; - - std::unique_ptr _transform{ nullptr }; - int _matrix_stride; - int _matrix_row_stride; -}; - -/** Interface for the kernel to perform Winograd weights transform. */ -class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICpuWinogradConv2dTransformWeightsKernel(const ICpuWinogradConv2dTransformWeightsKernel &) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICpuWinogradConv2dTransformWeightsKernel &operator=(const ICpuWinogradConv2dTransformWeightsKernel &) = default; - /** Allow instances of this class to be moved */ - ICpuWinogradConv2dTransformWeightsKernel(ICpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Allow instances of this class to be moved */ - ICpuWinogradConv2dTransformWeightsKernel &operator=(ICpuWinogradConv2dTransformWeightsKernel &&) = default; - - ICpuWinogradConv2dTransformWeightsKernel() - { - } - virtual ~ICpuWinogradConv2dTransformWeightsKernel() - { - } - /** Determine how much memory (in units of T) to allocate for the - * transformed weights. - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Storage size (in units of T) required. - */ - virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; - /** Gets the stride between matrices in the kernel worspace - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0; - - /** Configure the weights transform kernel. - * - * @param[in] weights_hwio Pointer to the weights tensor info - * @param[out] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. - */ - - virtual void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel - * - * @param[in] input First tensor input info. Data types supported: F16/F32. - * @param[in] weights Weights tensor info. Data types supported: same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights); -}; - -/** Kernel to perform Winograd weights transform. 
*/ -template -class CpuWinogradConv2dTransformWeightsKernel final : public ICpuWinogradConv2dTransformWeightsKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformWeightsKernel(const CpuWinogradConv2dTransformWeightsKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformWeightsKernel &operator=(const CpuWinogradConv2dTransformWeightsKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformWeightsKernel(CpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformWeightsKernel &operator=(CpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformWeightsKernel() = default; - - /** Default constructor. */ - CpuWinogradConv2dTransformWeightsKernel(); - const char *name() const override - { - return "CpuWinogradConv2dTransformWeightsKernel"; - } - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel - * - * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). - * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32. - * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - -#ifndef DOXYGEN_SKIP_THIS - /** Configure the weights transform kernel. - * - * @param[in] weights_hwio Pointer to the weights tensor info - * @param[out] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. - */ - void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; -#endif /* DOXYGEN_SKIP_THIS */ - - /** Determine how much memory (in units of T) to allocate for the - * transformed weights. - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Storage size (in units of T) required. - */ - unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Stride expressed in bytes. 
- */ - int get_matrix_stride(int num_output_channels, int num_input_channels) const override; - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; - -private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using WeightsTransform = typename WinogradBase::template WeightsTransform; - - std::unique_ptr _transform{ nullptr }; - int _num_output_channels; - int _matrix_stride; -}; - -/** Kernel to perform Winograd. */ -template -class CpuWinogradConv2dConfiguration -{ -public: - /** Winograd base kernel */ - using WinogradBase = winograd::WinogradGEMM; - /** Winograd convolution kernel */ - - using WinogradConv = typename WinogradBase::template Convolution; - - using TransformInputKernel = CpuWinogradConv2dTransformInputKernel; - using TransformWeightsKernel = CpuWinogradConv2dTransformWeightsKernel; - using TransformOutputKernel = CpuWinogradConv2dTransformOutputKernel; -}; - -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H*/ diff --git a/src/core/cpu/kernels/activation/list.h b/src/core/cpu/kernels/activation/list.h deleted file mode 100644 index 409d025db0..0000000000 --- a/src/core/cpu/kernels/activation/list.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H -#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ACTIVATION_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) - -DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation); -DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation); -DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation); -DECLARE_ACTIVATION_KERNEL(fp16_neon_activation); -DECLARE_ACTIVATION_KERNEL(fp16_sve_activation); -DECLARE_ACTIVATION_KERNEL(fp32_neon_activation); -DECLARE_ACTIVATION_KERNEL(fp32_sve_activation); - -#undef DECLARE_ACTIVATION_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */ diff --git a/src/core/cpu/kernels/activation/neon/fp16.cpp b/src/core/cpu/kernels/activation/neon/fp16.cpp deleted file mode 100644 index 6f2d5d8533..0000000000 --- a/src/core/cpu/kernels/activation/neon/fp16.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/NEMath.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) -{ - auto int_in = vreinterpretq_u16_f16(in); - return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. -#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast((1e-7), ExactTagType {})); -#endif /* __aarch64__ */ - - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, 
mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float16_t in = *(reinterpret_cast(input_ptr + x)); - float16_t tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/activation/neon/fp32.cpp b/src/core/cpu/kernels/activation/neon/fp32.cpp deleted file mode 100644 index 54301d45ad..0000000000 --- a/src/core/cpu/kernels/activation/neon/fp32.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) -{ - auto int_in = vreinterpretq_u32_f32(in); - return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 4; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. 
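// On non-aarch64 the zero lanes are handled explicitly: delta is added only to the lanes that compare equal to zero (via mask_float_vector) before the reciprocal square root, and those lanes are cleared again afterwards so that sqrt(0) still yields 0.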
-#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast(1e-24), ExactTagType {}); -#endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - 
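[For reference, the HARD_SWISH case that closes the switch above uses the usual relu6 formulation with 1/6 pre-baked as 0.166666667f. A scalar model, illustrative only and not part of the diff:]

#include <algorithm>

// hard_swish(x) = x * relu6(x + 3) / 6
inline float hard_swish_ref(float x)
{
    const float relu6 = std::min(std::max(x + 3.f, 0.f), 6.f);
    return x * relu6 * 0.166666667f;   // e.g. hard_swish(1.f) = 1 * 4 / 6 ≈ 0.6667
}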
wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float in = *(reinterpret_cast(input_ptr + x)); - float tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8.cpp b/src/core/cpu/kernels/activation/neon/qasymm8.cpp deleted file mode 100644 index a1217435b6..0000000000 --- a/src/core/cpu/kernels/activation/neon/qasymm8.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(0); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp deleted file mode 100644 index 8b40bf8e72..0000000000 --- a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); - const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); - const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); - const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(1.f); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { 
- tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qsymm16.cpp b/src/core/cpu/kernels/activation/neon/qsymm16.cpp deleted file mode 100644 index 54b41820f2..0000000000 --- a/src/core/cpu/kernels/activation/neon/qsymm16.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); 
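[The quantized NEON kernels above (qasymm8, qasymm8_signed, qsymm16) share one split: ReLU-family functions stay in the integer domain and fold the input/output quantization into a single scale s = scale_in / scale_out and offset o = offset_out - offset_in * s, while LOGISTIC, TANH, HARD_SWISH and LEAKY_RELU dequantize to float, apply the function and requantize. Below is a scalar model of the folded ReLU path; the helper is hypothetical and assumes uniform asymmetric quantization q = round(x / scale) + offset.]

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t relu_requantized(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
{
    const uint8_t q_zero = static_cast<uint8_t>(offset_in);      // quantized representation of 0.f
    const float   s      = scale_in / scale_out;
    const float   o      = -offset_in * s + offset_out;
    const float   q_out  = std::max(q_in, q_zero) * s + o;       // max() is ReLU in the quantized domain
    return static_cast<uint8_t>(std::min<long>(std::max<long>(std::lround(q_out), 0), 255));
}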
-} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp deleted file mode 100644 index 5e76e82c52..0000000000 --- a/src/core/cpu/kernels/activation/sve/fp16.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f16(1.f); - const auto const_0 = svdup_n_f16(0.f); - const auto const_6 = svdup_n_f16(6.f); - const auto const_3 = svdup_n_f16(3.f); - const auto const_inv_6 = svdup_n_f16(0.166666667f); - - const auto va = svdup_n_f16(act_info.a()); - const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp deleted file mode 100644 index cb9f82eb39..0000000000 --- a/src/core/cpu/kernels/activation/sve/fp32.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/SVEMath.h" - -#include -#include - -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f32(1.f); - const auto const_0 = svdup_n_f32(0.f); - const auto const_6 = svdup_n_f32(6.f); - const auto const_3 = svdup_n_f32(3.f); - const auto const_inv_6 = svdup_n_f32(0.166666667f); - - const auto va = svdup_n_f32(act_info.a()); - const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat32_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); - - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - - } - 
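[The SVE kernels in this hunk all use one loop shape that differs from the NEON files above: svwhilelt builds a governing predicate for the elements still to process, so partial vectors at the end of a row are handled by predication and no scalar leftover loop is needed. A minimal standalone sketch of that shape, illustrative only and assuming SVE is available at compile time:]

#include <arm_sve.h>

// Predicated SVE loop: pg masks off lanes past n, svcntw() advances by the
// hardware vector length in 32-bit lanes, and the loop ends when no lane is active.
void scale_f32_sve(const float *src, float *dst, int n, float factor)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    do
    {
        const svfloat32_t vin = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svmul_n_f32_z(pg, vin, factor));
        x += svcntw();
        pg = svwhilelt_b32(x, n);
    } while(svptest_any(svptrue_b32(), pg));
}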
while(svptest_any(svptrue_b32(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/qasymm8.cpp b/src/core/cpu/kernels/activation/sve/qasymm8.cpp deleted file mode 100644 index 69fffd96c5..0000000000 --- a/src/core/cpu/kernels/activation/sve/qasymm8.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8(0.f, qi_in); - const auto vconst_0 = svdup_n_u8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto 
voffset_in = svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svuint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, 
const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git 
a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp deleted file mode 100644 index 53ee515ff9..0000000000 --- a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); - const auto vconst_0 = svdup_n_s8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto voffset_in = svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), 
arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, 
svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin)), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/core/cpu/kernels/activation/sve/qsymm16.cpp b/src/core/cpu/kernels/activation/sve/qsymm16.cpp deleted file mode 100644 index 
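[Editor's note] The qasymm8_signed SVE activation kernel deleted above folds the input and output quantization parameters into a single scale/offset pair (s, o), and for the integer LEAKY_RELU path it converts that pair into 8-bit fixed point so the whole re-quantization becomes a multiply-add followed by a right shift by 8. A minimal scalar sketch of that folding is shown below; UQInfo and requantize_s8 are illustrative names only, not library API, and rounding details are simplified.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UQInfo { float scale; int32_t offset; }; // assumed stand-in for UniformQuantizationInfo

// Map a quantized value from the input space to the output space:
// de-quantize with (scale_in, offset_in), re-quantize with (scale_out, offset_out),
// folded into one multiply-add. The Q.8 fixed-point form mirrors the
// svmla + shift-right-by-8 used by the vector code.
int8_t requantize_s8(int8_t q_in, UQInfo in, UQInfo out)
{
    const float   s       = in.scale / out.scale;
    const float   o       = -in.offset * s + out.offset;
    const int32_t s_fixed = static_cast<int32_t>(std::lround(s * (1 << 8)));
    const int32_t o_fixed = static_cast<int32_t>(std::lround(o * (1 << 8)));
    const int32_t r       = (q_in * s_fixed + o_fixed) >> 8;
    return static_cast<int8_t>(std::clamp(r, -128, 127));
}

For LEAKY_RELU the kernel simply prepares a second fixed-point pair with the scale multiplied by act_info.a() and selects between the two per lane, depending on whether the element is below the input zero-point.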
ac549770a2..0000000000 --- a/src/core/cpu/kernels/activation/sve/qsymm16.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" - -#include -#include - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/SVESymm.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform 
activation - const svfloat32x2_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h deleted file mode 100644 index 379bd32fb1..0000000000 --- a/src/core/cpu/kernels/add/neon/list.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H -#define SRC_CORE_NEON_KERNELS_ADD_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_ADD_KERNEL(add_qasymm8_neon); -DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); -DECLARE_ADD_KERNEL(add_qsymm16_neon); - -#undef DECLARE_ADD_KERNEL - -template -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. 
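[Editor's note] The QSYMM16 activation kernel deleted above follows a de-quantize, compute in float, re-quantize pattern; since the format is symmetric there is no zero-point, only a scale on each side. A scalar sketch of the LOGISTIC path, with an illustrative helper name and assuming round-to-nearest re-quantization:

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t qsymm16_logistic(int16_t q_in, float scale_in, float scale_out)
{
    const float x = q_in * scale_in;            // de-quantize
    const float y = 1.f / (1.f + std::exp(-x)); // activation in float
    const long  q = std::lround(y / scale_out); // re-quantize
    return static_cast<int16_t>(std::clamp<long>(q, INT16_MIN, INT16_MAX));
}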
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? 
wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H diff --git a/src/core/cpu/kernels/add/neon/qasymm8.cpp b/src/core/cpu/kernels/add/neon/qasymm8.cpp deleted file mode 100644 index e357a7ef7f..0000000000 --- a/src/core/cpu/kernels/add/neon/qasymm8.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? 
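[Editor's note] The templated add_same_neon in the deleted add/neon/list.h handles two cases: when one input has a size-1 X dimension it is broadcast across the row, and the ConvertPolicy selects between saturating and wrapping addition; full vectors are processed first and a scalar loop handles the left-over elements. The scalar reference below captures the same per-row behaviour for integral element types; it is a hypothetical helper, not part of the library.

#include <cstddef>
#include <cstdint>
#include <limits>

template <typename T>
T add_with_policy(T a, T b, bool saturate)
{
    if(!saturate)
    {
        return static_cast<T>(a + b); // WRAP
    }
    const int64_t r  = static_cast<int64_t>(a) + static_cast<int64_t>(b);
    const int64_t lo = std::numeric_limits<T>::min();
    const int64_t hi = std::numeric_limits<T>::max();
    return static_cast<T>(r < lo ? lo : (r > hi ? hi : r)); // SATURATE
}

template <typename T>
void add_row(const T *in1, const T *in2, T *out, size_t n, bool broadcast_in2, bool saturate)
{
    for(size_t x = 0; x < n; ++x)
    {
        const T b = broadcast_in2 ? in2[0] : in2[x]; // size-1 X dimension is broadcast
        out[x]    = add_with_policy(in1[x], b, saturate);
    }
}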
input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); - - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, 
vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t a = vld1q_u8(input1_ptr + x); - const uint8x16_t b = vld1q_u8(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, 
vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp deleted file mode 100644 index d62d0739f5..0000000000 --- a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
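[Editor's note] The deleted add_qasymm8_neon works entirely in the float domain: each operand is de-quantized with its own scale and offset, the sum is re-quantized with the destination parameters, and the result is narrowed with unsigned saturation. The left-over loop above does exactly this per element; a scalar sketch (illustrative names, rounding simplified):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UQInfo { float scale; int32_t offset; }; // assumed stand-in for UniformQuantizationInfo

uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b, UQInfo qa, UQInfo qb, UQInfo qo)
{
    const float af = (static_cast<int32_t>(a) - qa.offset) * qa.scale; // de-quantize a
    const float bf = (static_cast<int32_t>(b) - qb.offset) * qb.scale; // de-quantize b
    const long  q  = std::lround((af + bf) / qo.scale) + qo.offset;    // re-quantize the sum
    return static_cast<uint8_t>(std::clamp<long>(q, 0, 255));
}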
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? 
vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t a = vld1q_s8(input1_ptr + x); - const int8x16_t b = vld1q_s8(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/neon/qsymm16.cpp b/src/core/cpu/kernels/add/neon/qsymm16.cpp deleted file mode 100644 index e76e408d6e..0000000000 --- 
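[Editor's note] The #ifdef __aarch64__ blocks in these add kernels switch between two float-to-int conversions: vcvtnq_s32_f32 (round to nearest, AArch64 only) and vcvtq_s32_f32 (truncate toward zero). The two can differ by one LSB after re-quantization, which is worth keeping in mind when comparing results across builds. A scalar illustration of the difference:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float   v         = 2.7f;
    const int32_t truncated = static_cast<int32_t>(v);                  // 2, like vcvtq_s32_f32
    const int32_t nearest   = static_cast<int32_t>(std::nearbyint(v));  // 3, like vcvtnq_s32_f32 (ties to even under the default rounding mode)
    std::printf("truncate=%d nearest=%d\n", truncated, nearest);
    return 0;
}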
a/src/core/cpu/kernels/add/neon/qsymm16.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#endif //__aarch64__ - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = 
vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#endif //__aarch64__ - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp deleted file mode 100644 index cf9e301c29..0000000000 --- a/src/core/cpu/kernels/add/sve/impl.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" - -#include "src/core/NEON/SVEMath.h" -#include "src/core/cpu/kernels/add/sve/impl.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const bool is_sat = (policy == ConvertPolicy::SATURATE); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? 
wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h deleted file mode 100644 index 32ff5d0496..0000000000 --- a/src/core/cpu/kernels/add/sve/impl.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
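[Editor's note] The deleted add_same_sve shows the predicated loop shape used throughout the SVE kernels: svwhilelt builds a per-lane predicate from the loop counter, so partially filled final vectors are handled by the predicate and no scalar tail loop is needed. A minimal float sketch of that loop structure, compiled only where SVE is available; the function name is illustrative.

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes where x + lane < n are active
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));
        x += svcntw();                 // number of 32-bit lanes in one vector
        pg = svwhilelt_b32(x, n);
    } while(svptest_any(svptrue_b32(), pg));
}
#endif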
- */ -#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H -#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h deleted file mode 100644 index 4d29c2a8f1..0000000000 --- a/src/core/cpu/kernels/add/sve/list.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H -#define SRC_CORE_SVE_KERNELS_ADD_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/add/sve/impl.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_ADD_KERNEL(add_qasymm8_sve); -DECLARE_ADD_KERNEL(add_qasymm8_signed_sve); -DECLARE_ADD_KERNEL(add_qsymm16_sve); - -#undef DECLARE_ADD_KERNEL - -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qasymm8.cpp b/src/core/cpu/kernels/add/sve/qasymm8.cpp deleted file mode 100644 index 888ad878ca..0000000000 --- a/src/core/cpu/kernels/add/sve/qasymm8.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
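[Editor's note] The impl.h / impl.cpp pair deleted above uses the usual explicit-instantiation pattern: the header only declares the template, and the .cpp provides the definition plus one explicit instantiation per supported element type (the "template void add_same_sve..." lines), so the SVE code is compiled once and other translation units just link against it. A generic sketch of the pattern with illustrative file names and a reduced type set:

// sketch_impl.h
#include <cstdint>
template <typename T>
void add_elements(const T *a, const T *b, T *dst, int n);

// sketch_impl.cpp
template <typename T>
void add_elements(const T *a, const T *b, T *dst, int n)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = a[i] + b[i]; // definition lives in the .cpp only
    }
}

// Explicit instantiations: these are the only element types callers can link against.
template void add_elements<float>(const float *, const float *, float *, int);
template void add_elements<int16_t>(const int16_t *, const int16_t *, int16_t *, int);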
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const auto all_true_pg = svptrue_b8(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto voffseto = svdup_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); - const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); - const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); - const svint32_t voffset2 = is_broadcast_input_2 ? 
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - - do - { - const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto voffset1 = svdup_n_s32(iq1_info.offset); - const auto voffset2 = svdup_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto 
input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto a = svld1_u8(pg, input1_ptr + x); - const auto b = svld1_u8(pg, input2_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp deleted file mode 100644 index 3b922c6c21..0000000000 --- a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto voffseto = svdup_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const auto all_true_pg = svptrue_b8(); - - const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); - const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); - const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); - const auto voffset2 = is_broadcast_input_2 ? 
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - - do - { - const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto voffset1 = svdup_n_s32(iq1_info.offset); - const auto voffset2 = svdup_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto a = 
svld1_s8(pg, input1_ptr + x); - const auto b = svld1_s8(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qsymm16.cpp b/src/core/cpu/kernels/add/sve/qsymm16.cpp deleted file mode 100644 index eef5d245d3..0000000000 --- a/src/core/cpu/kernels/add/sve/qsymm16.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
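/*
 * Scalar sketch (illustrative only; the helper name and the float intermediates mirror, but are
 * not part of, the sources above; requires <algorithm> and the ACL UniformQuantizationInfo type)
 * of the arithmetic that the SVE2 add loops above (add_qasymm8_sve and add_qasymm8_signed_sve)
 * vectorise: dequantize each input with its own offset/scale, add in float, then requantize
 * against the output quantization info and saturate. The unsigned variant is identical except
 * that it clamps to [0, 255].
 */
inline int8_t add_qasymm8_signed_ref(int8_t a, int8_t b,
                                     const UniformQuantizationInfo &iq1,
                                     const UniformQuantizationInfo &iq2,
                                     const UniformQuantizationInfo &oq)
{
    const float af = (a - iq1.offset) * iq1.scale;     // voffset1 / vscale1
    const float bf = (b - iq2.offset) * iq2.scale;     // voffset2 / vscale2
    const float rf = (af + bf) / oq.scale + oq.offset; // invvscaleo / voffseto
    return static_cast<int8_t>(std::max(-128.f, std::min(127.f, rf))); // svqxtnb/svqxtnt saturation
}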
- */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto all_true_pg = svptrue_b16(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s16(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); - - do - { - const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - auto a = svld1_s16(pg, input1_ptr + x); - auto b = svld1_s16(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h deleted file mode 100644 index 4b7b092d01..0000000000 --- a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H -#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" - -#include "gemm_common.hpp" - -namespace arm_compute -{ -class ITensor; - -namespace cpu -{ -namespace kernel -{ -/** This class is a wrapper for the assembly kernels. - * - * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55. - * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance - * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel - * in the context of an NEFunctions. - * - * The type T is the type of the actual kernel implemented in assembly which is of type - * template class GemmCommon - * - * - */ -template -class CpuGemmAssemblyWrapperKernel final : public INEKernel -{ -public: - /** Constructor - */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") - { - } - - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; - CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; - - const char *name() const override - { - return _name.c_str(); - } - - void run(const Window &window, const ThreadInfo &info) override - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - auto win = arm_gemm::to_ndcoord(window); - - arm_gemm::ndcoord_t thread_locator{}; - - _kernel->execute(win, thread_locator, info.thread_id); - } - - // Inherited methods overridden: - void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - //convert between arm_compute and arm_gemm types - auto ndc_win = arm_gemm::to_ndcoord(window); - auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator); - - _kernel->execute(ndc_win, ndc_tlc, info.thread_id); - } - - /** Initialise the kernel's input and output. - * - * @param[in] kernel Pointer to an assembly kernel implementation. 
- * @param[in] kernel_name_tag Tag to be attached to the kernel's name. - */ - void configure(arm_gemm::GemmCommon *kernel, std::string kernel_name_tag) - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(kernel))); - _kernel = kernel; - - Window win = to_window(kernel->get_window_size()); - - INEKernel::configure(win); - - if(!kernel_name_tag.empty()) - { - _name += "/" + kernel_name_tag; - } - } - -private: - arm_gemm::GemmCommon *_kernel; - std::string _name; -}; -} // namespace kernel -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp deleted file mode 100644 index e38cc09202..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm.hpp +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#pragma once - -#include -#include -#include - -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - -namespace arm_gemm -{ -enum class GemmMethod -{ - DEFAULT, - GEMV_BATCHED, - GEMV_PRETRANSPOSED, - GEMV_NATIVE_TRANSPOSED, - GEMM_NATIVE, - GEMM_HYBRID, - GEMM_INTERLEAVED, - GEMM_INTERLEAVED_2D, - QUANTIZE_WRAPPER, - QUANTIZE_WRAPPER_2D, - GEMM_HYBRID_QUANTIZED -}; - -struct KernelDescription -{ - GemmMethod method = GemmMethod::DEFAULT; - std::string name = ""; - bool is_default = false; - uint64_t cycle_estimate = 0; - - KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0) - : method(m), name(n), is_default(d), cycle_estimate(c) - { - } - KernelDescription() noexcept - { - } -}; - -struct GemmConfig -{ - GemmMethod method = GemmMethod::DEFAULT; - std::string filter = ""; - unsigned int inner_block_size = 0; - unsigned int outer_block_size = 0; - - GemmConfig(GemmMethod method) - : method(method) - { - } - GemmConfig() - { - } -}; - -struct Activation -{ - enum class Type - { - None, - ReLU, - BoundedReLU - }; - - Type type; - float param1; - float param2; - - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1), param2(p2) - { - } -}; - -struct GemmArgs -{ -public: - const CPUInfo *_ci; - unsigned int _Msize; - unsigned int _Nsize; - unsigned int _Ksize; - unsigned int _Ksections; - unsigned int _nbatches; - unsigned int _nmulti; - bool _indirect_input; - Activation _act; - int _maxthreads; - bool _fast_mode; - const GemmConfig *_cfg; - - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode), - _cfg(cfg) - { - } -}; - -struct Requantize32 -{ -public: - const int32_t *bias = nullptr; - size_t bias_multi_stride = 0; - int32_t a_offset = 0; - int32_t b_offset = 0; - int32_t c_offset = 0; - bool per_channel_requant = false; - int32_t per_layer_left_shift = 0; - int32_t per_layer_right_shift = 0; - int32_t per_layer_mul = 0; - const int32_t *per_channel_left_shifts = nullptr; - const int32_t *per_channel_right_shifts = nullptr; - const int32_t *per_channel_muls = nullptr; - int32_t minval = 0; - int32_t maxval = 0; - - Requantize32() = default; - - // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), - per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) - { - } - - // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - const int32_t *requant_left_shifts, - const int32_t *requant_right_shifts, - const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), 
per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) - { - } -}; - -struct Nothing -{ -}; - -template -using UniqueGemmCommon = std::unique_ptr>; - -/* Low level API calls. - * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */ - -/* get_gemm_method(): Given the templated types and provided parameters, - * which is the preferred method to implement this GEMM? */ -template -KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); - -template -UniqueGemmCommon gemm(const GemmArgs &args, const OutputStage & = {}); - -template -std::vector get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp deleted file mode 100644 index 718fcd1fb4..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "arm_compute/core/Dimensions.h" -#include "arm_compute/core/Window.h" - -#include "ndrange.hpp" - -#include - -/* This file contains mapping between integral types used in arm_compute and arm_gemm - * These two codebases both require a degree of separation for the sake of modularity - * so maintain their own types which represent similar information. 
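/*
 * Construction sketch for the argument structs above (illustrative only; `ci` and `bias_ptr`
 * stand for a valid CPUInfo pointer and bias buffer, and the numeric values are arbitrary).
 * A GemmArgs plus an optional output stage such as Requantize32 is what callers hand to the
 * get_gemm_method() / gemm() factories declared in arm_gemm.hpp above.
 */
arm_gemm::Activation act(arm_gemm::Activation::Type::BoundedReLU, 6.0f, 0.0f);
arm_gemm::GemmArgs   args(ci, /*M*/ 128, /*N*/ 64, /*K*/ 256, /*Ksections*/ 1,
                          /*nbatches*/ 1, /*nmulti*/ 1, /*indirect_input*/ false,
                          act, /*maxthreads*/ 4);
arm_gemm::Requantize32 requant(bias_ptr, /*bias_multi_stride*/ 0,
                               /*a_offset*/ 10, /*b_offset*/ -5, /*c_offset*/ 3,
                               /*requant_shift*/ -8, /*requant_mul*/ 1 << 30,
                               /*minv*/ -128, /*maxv*/ 127);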
- */ - -namespace arm_gemm -{ -//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions::num_max_dimensions; - -using ndrange_t = NDRange; -using ndcoord_t = NDCoordinate; - -/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window` - * - * As `NDRange` does not encode start positions, we specify - * the start to be zero in the produced `arm_compute::Window` - * - * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window` - * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr` - */ -inline arm_compute::Window to_window(const ndrange_t &ndr) -{ - arm_compute::Window win; - - for(unsigned int i = 0; i != ndrange_max; ++i) - { - //populate the window with the dimensions of the NDRange - win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); - } - - return win; -} - -/* - * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window` - * - * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window` - * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc` - */ -inline arm_compute::Window to_window(const ndcoord_t &ndc) -{ - arm_compute::Window win; - - for(unsigned int i = 0; i != ndrange_max; ++i) - { - const auto start = ndc.get_position(i); - const auto size = ndc.get_size(i); - const auto stop = start + size; - - //populate the window with the dimensions of the NDRange - win.set(i, arm_compute::Window::Dimension(start, stop)); - } - - return win; -} - -/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions - * - * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()` - * whereas `arm_gemm::ndrange_t` only has a size, as a result we store the delta between start and end - * - * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t` - * @return the resultant ndrange_t - */ -inline ndrange_t to_ndrange(const arm_compute::Window &win) -{ - return - { - static_cast(win[0].end() - win[0].start()), - static_cast(win[1].end() - win[1].start()), - static_cast(win[2].end() - win[2].start()), - static_cast(win[3].end() - win[3].start()), - static_cast(win[4].end() - win[4].start()), - static_cast(win[5].end() - win[5].start()) - }; -} - -/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions - * - * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t` - * @return the resultant ndcoord_t - */ -inline ndcoord_t to_ndcoord(const arm_compute::Window &win) -{ - return - { - { static_cast(win[0].start()), static_cast(win[0].end() - win[0].start()) }, - { static_cast(win[1].start()), static_cast(win[1].end() - win[1].start()) }, - { static_cast(win[2].start()), static_cast(win[2].end() - win[2].start()) }, - { static_cast(win[3].start()), static_cast(win[3].end() - win[3].start()) }, - { static_cast(win[4].start()), static_cast(win[4].end() - win[4].start()) }, - { static_cast(win[5].start()), static_cast(win[5].end() - win[5].start()) } - }; -} - -} //namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp deleted file mode 100644 index 78e0adf31f..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited.
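/*
 * Usage sketch (illustrative only) for the conversion helpers in arm_gemm_compute_iface.hpp
 * above: a Window stores a start and an end per dimension, an ndcoord_t stores {start, size}
 * pairs, and an ndrange_t keeps only the sizes.
 */
arm_compute::Window win;
win.set(0, arm_compute::Window::Dimension(8, 24));  // start 8, end 24
const auto ndc  = arm_gemm::to_ndcoord(win);        // dimension 0 becomes { 8, 16 }
const auto ndr  = arm_gemm::to_ndrange(win);        // dimension 0 becomes size 16, the start is dropped
const auto win2 = arm_gemm::to_window(ndc);         // restores [8, 24); to_window(ndr) would yield [0, 16)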
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -/* This file is used to configure integration-specific aspects of arm_gemm into ACL */ - -#include "arm_compute/core/CPP/CPPTypes.h" - -using CPUModel = arm_compute::CPUModel; -using CPUInfo = arm_compute::CPUInfo; diff --git a/src/core/cpu/kernels/assembly/convolution_parameters.hpp b/src/core/cpu/kernels/assembly/convolution_parameters.hpp deleted file mode 100644 index 0c1ae58902..0000000000 --- a/src/core/cpu/kernels/assembly/convolution_parameters.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include - -namespace arm_gemm -{ -/* - * Parameter set for "convolution" type GEMM. - * - * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if - * an im2row had been performed on the input tensor to generate the operand - * matrix, but instead this structure describes the convolution parameters - * such that this can be done on the fly. - * - * The parameters describe the convolution details - the notional shape of - * the input and output tensors, whether padding is to be applied, the size - * of the kernel and a constant value to be used for padding (needed for - * quantized tensors). 
- * - * The second part describes the layout of the input tensor in memory, which - * is assumed to be in NHWC format. This consists of a base pointer and - * strides for columns, rows and batches. 'multis' are not supported for - * convolution type GEMMs. - */ -struct ConvolutionParameters -{ - int64_t input_width; - int64_t input_height; - int64_t input_channels; - int64_t kernel_width; - int64_t kernel_height; - int64_t output_width; - int64_t output_height; - int64_t output_stride_w; - int64_t output_stride_h; - // output_channels not included as they do not affect the input. - int64_t padding_top; - int64_t padding_left; - float padding_value; -}; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp deleted file mode 100644 index 378f1041be..0000000000 --- a/src/core/cpu/kernels/assembly/gemm_common.hpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "convolution_parameters.hpp" -#include "ndrange.hpp" - -#include - -namespace arm_gemm -{ -// Avoid circular dependency with arm_gemm.hpp -struct GemmConfig; - -// Abstract class for the GEMM/GEMV functions. -// -// GEMM implementations may be "native" (never require any input -// permutation), "pretransposed" (require permutation up-front) or require -// working space (permute as they go along). This interface should support -// all of them. - -// The real GemmCommon class is templated based on the operand and return -// type. This is an interface class which is independent of those types. -class IGemmCommon -{ -public: - /* Pass in the pointers to the arrays to be operated on and their - * strides. This "generic" version uses void *s, the preferred version - * is the one provided by templated GemmCommon (below) which takes - * appropriately typed pointers. If B is pretransposed (see below) then - * the settings for B here are ignored. 
- */ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; - - /** @returns an ndrange containing ranges of the compute space which can be - * broken up and parallelised over - */ - virtual ndrange_t get_window_size() const = 0; - - /* The maximum thread count is specified when the GEMM is created. Some - * implementations need to know how many threads will actually run in - * order to work properly. - * - * In some cases, after creating the GEMM the number of threads needs to - * be reduced (e.g. not enough work to split across threads). This - * method allows the number of actual threads to be run to be set (must - * be equal or lower). - * - * This has an empty default implementation, as GEMMs which don't care - * about thread count can safely ignore this. - */ - virtual void set_nthreads(int) {}; - - /* Whether this GEMM can be dynamically scheduled or not. */ - virtual bool supports_dynamic_scheduling() const - { - return false; - } - - /** Main execute member function - * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() - * @param [in] thread_locator where are we inside of the thread space - * @param [in] threadid a unique threadid - */ - virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0; - - /*** Working space interface (optional) ***/ - /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ - virtual size_t get_working_size() const - { - return 0; - } - /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; - - /*** "Pretransposed" interface (optional) ***/ - /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ - virtual bool B_is_pretransposed() const - { - return false; - } - /* Does pretranspose still need to be done? */ - virtual bool B_pretranspose_required() const - { - return false; - } - /* Total number of bytes of space needed for pretransposed arrays. */ - virtual size_t get_B_pretransposed_array_size() const - { - return 0; - } - /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */ - /* The "real" version of this depends on the templated operand type (see below). */ - virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; - /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ - virtual void set_pretransposed_B_data(void *) - { - } - - /*** "Quantized bias" interface (optional) ***/ - /* Set the bias vector for quantized GEMMs */ - virtual void set_quantized_bias(const int32_t *, size_t) - { - } - - /*** Indirect interface (optional) ***/ - /* Set the indirect table.
This comprises a number of values per kernel point, and a densely packed array of pointers, - * multis * batches * kernel_points */ - virtual void set_indirect_parameters_generic(size_t, const void *const *const *) - { - } - - /*** Convolution interface (optional) ***/ - /* Set the convolution parameters. */ - virtual void set_convolution_parameters(ConvolutionParameters) - { - } - - /*** Introspection interface ***/ - /* Get the configuration of this GEMM */ - virtual GemmConfig get_config() = 0; - - // Destructor - virtual ~IGemmCommon() - { - } -}; - -/* "Real" GemmCommon class which is templated on the operand and return types. - * - * In addition to correctly typed versions of the functions that operate on - * operand and return data, this class provides a default implementation of - * 'set_arrays' to capture the provided arguments in protected class - * members, as essentially any implementation will need these. - */ -template -class GemmCommon : public IGemmCommon -{ -protected: - const To *_Aptr = nullptr; - int _lda = 0; - int _A_batch_stride = 0; - int _A_multi_stride = 0; - const To *_Bptr = nullptr; - int _ldb = 0; - int _B_multi_stride = 0; - Tr *_Cptr = nullptr; - int _ldc = 0; - int _C_batch_stride = 0; - int _C_multi_stride = 0; - const Tr *_bias = nullptr; - int _bias_multi_stride = 0; - -public: - /* Pass in the pointers to the arrays to be operated on and their - * strides (templated version with appropriate types). */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) - { - _Aptr = A; - _lda = lda; - _A_batch_stride = A_batch_stride; - _A_multi_stride = A_multi_stride; - _Bptr = B; - _ldb = ldb; - _B_multi_stride = B_multi_stride; - _Cptr = C; - _ldc = ldc; - _C_batch_stride = C_batch_stride; - _C_multi_stride = C_multi_stride; - _bias = bias; - _bias_multi_stride = bias_multi_stride; - } - - /* Implementation of the void * overload which casts its arguments to the appropriate type. */ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override - { - set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, - static_cast(B), ldb, B_multi_stride, - static_cast(C), ldc, C_batch_stride, C_multi_stride, - static_cast(bias), bias_multi_stride); - } - - /*** "Pretransposed" interface ***/ - - /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ - /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; - - /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ - void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override - { - pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); - } - - /*** Indirect interface ***/ - virtual void set_indirect_parameters(size_t, const To *const *const *) - { - } - - void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override - { - set_indirect_parameters(sz, reinterpret_cast(ptr)); - } -}; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/ndrange.hpp b/src/core/cpu/kernels/assembly/ndrange.hpp deleted file mode 100644 index 1c8261aef7..0000000000 --- a/src/core/cpu/kernels/assembly/ndrange.hpp +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include -#include -#include -#include - -namespace arm_gemm -{ -template -class NDRange -{ -private: - std::array m_sizes{}; - std::array m_totalsizes{}; - - class NDRangeIterator - { - private: - const NDRange &m_parent; - unsigned int m_pos = 0; - unsigned int m_end = 0; - - public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) - { - } - - bool done() const - { - return (m_pos >= m_end); - } - - unsigned int dim(unsigned int d) const - { - unsigned int r = m_pos; - - if(d < (D - 1)) - { - r %= m_parent.m_totalsizes[d]; - } - - if(d > 0) - { - r /= m_parent.m_totalsizes[d - 1]; - } - - return r; - } - - bool next_dim0() - { - m_pos++; - - return !done(); - } - - bool next_dim1() - { - m_pos += m_parent.m_sizes[0] - dim(0); - - return !done(); - } - - unsigned int dim0_max() const - { - unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); - - return dim(0) + offset; - } - }; - - void set_totalsizes() - { - unsigned int t = 1; - - for(unsigned int i = 0; i < D; i++) - { - if(m_sizes[i] == 0) - { - m_sizes[i] = 1; - } - - t *= m_sizes[i]; - - m_totalsizes[i] = t; - } - } - -public: - NDRange &operator=(const NDRange &rhs) = default; - NDRange(const NDRange &rhs) = default; - - template - NDRange(T... ts) - : m_sizes{ ts... 
} - { - set_totalsizes(); - } - - NDRange(const std::array &n) - : m_sizes(n) - { - set_totalsizes(); - } - - NDRangeIterator iterator(unsigned int start, unsigned int end) const - { - return NDRangeIterator(*this, start, end); - } - - unsigned int total_size() const - { - return m_totalsizes[D - 1]; - } - - unsigned int get_size(unsigned int v) const - { - return m_sizes[v]; - } -}; - -/** NDCoordinate builds upon a range, but specifies a starting position - * in addition to a size which it inherits from NDRange - */ -template -class NDCoordinate : public NDRange -{ - using int_t = unsigned int; - using ndrange_t = NDRange; - - std::array m_positions{}; - -public: - NDCoordinate &operator=(const NDCoordinate &rhs) = default; - NDCoordinate(const NDCoordinate &rhs) = default; - NDCoordinate(const std::initializer_list> &list) - { - std::array sizes{}; - - std::size_t i = 0; - for(auto &p : list) - { - m_positions[i] = p.first; - sizes[i++] = p.second; - } - - //update the parents sizes - static_cast(*this) = ndrange_t(sizes); - } - - int_t get_position(int_t d) const - { - assert(d < N); - - return m_positions[d]; - } - - void set_position(int_t d, int_t v) - { - assert(d < N); - - m_positions[d] = v; - } - - int_t get_position_end(int_t d) const - { - return get_position(d) + ndrange_t::get_size(d); - } -}; //class NDCoordinate - -using ndrange_t = NDRange<6>; -using ndcoord_t = NDCoordinate<6>; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h deleted file mode 100644 index 43e44be5e2..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
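/*
 * Usage sketch (illustrative only) for the NDRange / NDCoordinate classes in ndrange.hpp above:
 * an NDCoordinate is built from one {position, size} pair per dimension, while the inherited
 * NDRange part only sees the sizes.
 */
arm_gemm::NDCoordinate<2> coord{ { 4, 16 }, { 0, 8 } }; // dim 0: start 4, size 16; dim 1: start 0, size 8
const auto start0 = coord.get_position(0);     // 4
const auto size0  = coord.get_size(0);         // 16, from the NDRange base
const auto end0   = coord.get_position_end(0); // 4 + 16 = 20
const auto total  = coord.total_size();        // 16 * 8 = 128 points in the flattened space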
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H - -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = std::min(16 / static_cast(sizeof(OutputScalarType)), 8); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); - } -} - -template -inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch(op) - { - case ArithmeticOperation::MAX: - res = std::max(a, b); - break; - case ArithmeticOperation::MIN: - res = std::min(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - res = (a - b) * (a - b); - break; - } - case ArithmeticOperation::PRELU: - { - res = (a > 0 ? a : a * b); - break; - } - case ArithmeticOperation::DIV: - { - res = a / b; - if(std::is_integral::value) - { - res = (b == 0) ? 0 : res; - if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) - { - --res; - } - } - break; - } - case ArithmeticOperation::POWER: - { - res = std::pow(a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) -{ - using vec_type = typename VectorType::type; - using scalar_type = typename VectorType::scalar_type; - using tag_type = typename VectorType::tag_type; - - vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); - - switch(op) - { - case ArithmeticOperation::MAX: - res = wrapper::vmax(a, b); - break; - case ArithmeticOperation::MIN: - res = wrapper::vmin(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const vec_type tmp = wrapper::vsub(a, b); - res = wrapper::vmul(tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const vec_type zero = wrapper::vdup_n(static_cast(0), tag_type{}); - const vec_type tmp = wrapper::vmul(a, b); - const auto gt = wrapper::vcgt(a, zero); - - res = wrapper::vbsl(gt, a, tmp); - break; - } - - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template <> -inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) -{ - return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vpow(a, b); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vpow(a, b); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template -inline 
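For integral element types, the ArithmeticOperation::DIV case above turns C++'s truncating division into floor division (rounding toward negative infinity) and defines division by zero as 0. A standalone scalar version of the same rule, with a few worked values:

    #include <cassert>

    // Floor division with x/0 defined as 0, matching the integral DIV case above.
    int floor_div(int a, int b)
    {
        if(b == 0)
        {
            return 0;
        }
        int res = a / b;                          // C++ division truncates toward zero
        if((a % b != 0) && ((a < 0) != (b < 0)))  // remainder exists and operands have opposite signs
        {
            --res;                                // step down to the floor
        }
        return res;
    }

    int main()
    {
        assert(floor_div(7, 2) == 3);    //  3.5 ->  3
        assert(floor_div(-7, 2) == -4);  // -3.5 -> -4 (plain truncation would give -3)
        assert(floor_div(5, 0) == 0);    // division by zero maps to 0
    }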
typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) -{ - using tag_type = typename VectorType::tag_type; - using vec_type = typename VectorType::type; - - vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); - return elementwise_arithm_op(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); -} - -template -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_arithm_op(a, b)); - } - return x; -} - -template -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); - } - return x; -} - -template -void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using scalar_type = typename VectorType::scalar_type; - - elementwise_op(in1, in2, out, window, - &elementwise_arithm_op_scalar, - &elementwise_arithm_op_broadcast_loop, - &elementwise_arithm_op_loop); -} - -template -inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) -{ - bool res = false; - - switch(op) - { - case ComparisonOperation::Equal: - res = (a == b); - break; - case ComparisonOperation::NotEqual: - res = (a != b); - break; - case ComparisonOperation::Greater: - res = (a > b); - break; - case ComparisonOperation::GreaterEqual: - res = (a >= b); - break; - case ComparisonOperation::Less: - res = (a < b); - break; - case ComparisonOperation::LessEqual: - res = (a <= b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res ? ~static_cast(0) : static_cast(0); -} - -template -inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) -{ - OutputVectorType res = { 0, 0, 0, 0 }; - - switch(op) - { - case ComparisonOperation::Equal: - res = wrapper::vceq(a, b); - break; - case ComparisonOperation::NotEqual: - res = wrapper::vnot(wrapper::vceq(a, b)); - break; - case ComparisonOperation::Greater: - res = wrapper::vcgt(a, b); - break; - case ComparisonOperation::GreaterEqual: - res = wrapper::vcge(a, b); - break; - case ComparisonOperation::Less: - res = wrapper::vcgt(b, a); - break; - case ComparisonOperation::LessEqual: - res = wrapper::vcge(b, a); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) -{ - InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? 
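The *_op_loop helpers above consume full 128-bit vectors, window_step_x elements at a time, and return the index they stopped at so the caller can finish the remainder with the scalar functor. The same pattern in isolation, using MAX on float32 (a sketch that builds only for a NEON-capable target):

    #include <arm_neon.h>
    #include <algorithm>

    void max_f32(const float *a, const float *b, float *out, int n)
    {
        int x = 0;
        for(; x <= n - 4; x += 4)   // vector main loop: 4 floats per iteration
        {
            vst1q_f32(out + x, vmaxq_f32(vld1q_f32(a + x), vld1q_f32(b + x)));
        }
        for(; x < n; ++x)           // scalar tail, as the framework's caller does after the loop returns x
        {
            out[x] = std::max(a[x], b[x]);
        }
    }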
a : broadcast_vector); -} - -template -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, a); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); - } - if(x <= window_end_x - 4) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(a, i); - } - x = +4; - } - return x; -} - -template -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, res); - } - return x; -} - -template -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); - } - return x; -} - -template -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto a = wrapper::vloadq(input1_ptr + x); - auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - a = 
wrapper::vloadq(input1_ptr + x + 4); - b = wrapper::vloadq(input2_ptr + x + 4); - const auto res2 = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); - } - if(x <= window_end_x - 4) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(res, i); - } - x = +4; - } - return x; -} - -template -void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_8_loop, - &elementwise_comp_op_8_loop); -} - -template -void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_16_loop, - &elementwise_comp_op_16_loop); -} - -template -void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_32_loop, - &elementwise_comp_op_32_loop); -} -} // namesapce cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h deleted file mode 100644 index 1ff4632f5c..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h +++ /dev/null @@ -1,654 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
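The comparison kernels above always write a U8 mask output (0x00 or 0xFF per element), so for 16- and 32-bit inputs the per-lane results have to be narrowed before they are stored, which is what the vmovn/vcombine chains in elementwise_comp_op_16_loop and elementwise_comp_op_32_loop do. The narrowing step in isolation (NEON-only sketch, illustrative function name):

    #include <arm_neon.h>

    // Two uint32x4_t masks (all-ones / all-zeros per lane) become 8 bytes of 0xFF / 0x00.
    uint8x8_t masks_to_bytes(uint32x4_t m0, uint32x4_t m1)
    {
        const uint16x8_t m16 = vcombine_u16(vmovn_u32(m0), vmovn_u32(m1));
        return vmovn_u16(m16);
    }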
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; - return out; -} - -float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; - return out; -} - -void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) -{ - const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); - const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) -{ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized(output_ptr, out); -} - -void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) -{ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_s8(output_ptr, vcombine_s8(pa, pb)); -} - -void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized_signed(output_ptr, out); -} - -template -inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, 
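load_quantized and load_quantized_signed above widen 16 quantized values loaded with vld1q into four float32x4 registers and dequantize them as scale * (q - offset); the store helpers perform the inverse. A sketch of the same widening for half a register, i.e. 8 unsigned bytes (NEON-only, illustrative function name):

    #include <arm_neon.h>
    #include <cstdint>

    // u8 -> u16 -> u32 -> f32, then (q - offset) * scale, for 8 values.
    void dequantize_8_values(const uint8_t *src, float *dst, int32_t offset, float scale)
    {
        const uint8x8_t   q8      = vld1_u8(src);
        const uint16x8_t  q16     = vmovl_u8(q8);
        const int32x4_t   lo      = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(q16)));
        const int32x4_t   hi      = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(q16)));
        const int32x4_t   voffset = vdupq_n_s32(offset);
        const float32x4_t vscale  = vdupq_n_f32(scale);
        vst1q_f32(dst,     vmulq_f32(vcvtq_f32_s32(vsubq_s32(lo, voffset)), vscale));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(hi, voffset)), vscale));
    }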
UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - using neon_vector_float = wrapper::traits::neon_vector; - float32x4x4_t out = - { - { - elementwise_arithm_op(a.val[0], b.val[0]), - elementwise_arithm_op(a.val[1], b.val[1]), - elementwise_arithm_op(a.val[2], b.val[2]), - elementwise_arithm_op(a.val[3], b.val[3]), - } - }; - return out; -} - -template -inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - ARM_COMPUTE_UNUSED(qinfo); - return elementwise_comp_op_scalar(a, b); -} - -template -inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - uint32x4x4_t out = - { - { - elementwise_comp_op(a.val[0], b.val[0]), - elementwise_comp_op(a.val[1], b.val[1]), - elementwise_comp_op(a.val[2], b.val[2]), - elementwise_comp_op(a.val[3], b.val[3]) - } - }; - return out; -} - -template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} -template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, 
const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -template -void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, - &elementwise_arithm_op_quantized_broadcast_loop, - &elementwise_arithm_op_quantized_loop); -} -template -void 
elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, - &elementwise_arithm_op_quantized_signed_broadcast_loop, - &elementwise_arithm_op_quantized_singed_loop); -} - -template -void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_broadcast_loop, - &elementwise_comp_op_quantized_loop); -} - -template -void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_signed_broadcast_loop, - &elementwise_comp_op_quantized_signed_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h deleted file mode 100644 index 307e95fae9..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
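Putting the pieces of the deleted QASYMM8 kernels together, each output element is produced by dequantizing both inputs with their own (scale, offset), applying the operation in float, and requantizing with the output's parameters; the SIMD paths above do exactly this sixteen elements at a time. A self-contained scalar model for MAX (illustrative names, simple clamping in place of the library's quantize helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t quantized_max(uint8_t a, uint8_t b,
                          float scale1, int32_t offset1,
                          float scale2, int32_t offset2,
                          float scale_o, int32_t offset_o)
    {
        const float af = scale1 * (static_cast<int32_t>(a) - offset1);   // dequantize input 1
        const float bf = scale2 * (static_cast<int32_t>(b) - offset2);   // dequantize input 2
        const float rf = std::max(af, bf);                               // operate in float
        const int32_t q = static_cast<int32_t>(std::lround(rf / scale_o)) + offset_o;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));      // requantize and clamp
    }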
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" - -namespace arm_compute -{ -namespace cpu -{ -template -inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 1 / sqrt(a); - case ElementWiseUnary::EXP: - return std::exp(a); - case ElementWiseUnary::NEG: - return -a; - case ElementWiseUnary::LOG: - return std::log(a); - case ElementWiseUnary::ABS: - return std::abs(a); - case ElementWiseUnary::ROUND: - return support::cpp11::nearbyint(a); - case ElementWiseUnary::SIN: - return std::sin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return wrapper::vinvsqrt(a); - case ElementWiseUnary::EXP: - return wrapper::vexpq(a); - case ElementWiseUnary::NEG: - return wrapper::vneg(a); - case ElementWiseUnary::LOG: - return wrapper::vlog(a); - case ElementWiseUnary::ABS: - return wrapper::vabs(a); - case ElementWiseUnary::ROUND: - return wrapper::vround(a); - case ElementWiseUnary::SIN: - return wrapper::vsin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); -} - -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp deleted file mode 100644 index 58ebb28fe5..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
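The unary kernels above follow the same vector-loop-plus-scalar-tail structure as the binary ones, but select the operation at run time from the ElementWiseUnary enum inside the loop body. A portable scalar model of that dispatch (illustrative names, standard-library maths in place of the NEON wrappers):

    #include <cmath>
    #include <cstddef>

    enum class UnaryOp { RSQRT, EXP, NEG, LOG, ABS, ROUND, SIN };

    float unary_scalar(UnaryOp op, float a)
    {
        switch(op)
        {
            case UnaryOp::RSQRT: return 1.0f / std::sqrt(a);
            case UnaryOp::EXP:   return std::exp(a);
            case UnaryOp::NEG:   return -a;
            case UnaryOp::LOG:   return std::log(a);
            case UnaryOp::ABS:   return std::fabs(a);
            case UnaryOp::ROUND: return std::nearbyint(a);
            case UnaryOp::SIN:   return std::sin(a);
            default:             return a;
        }
    }

    void unary_apply(UnaryOp op, const float *in, float *out, std::size_t n)
    {
        for(std::size_t i = 0; i < n; ++i)
        {
            out[i] = unary_scalar(op, in[i]);   // the kernel does the same, one vector of lanes at a time
        }
    }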
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template -void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void comparison_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -using LoopFuncType = void (*)(svbool_t, const LoopArguments &); - -template -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); - -template ::type, - typename OutputScalarType = typename sve_scalar::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType func, - BroadcastLoopFuncType broadcast_func) -{ - const auto all_true_pg = svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
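Unlike the NEON kernels, the SVE driver above needs no scalar tail: svwhilelt builds a predicate covering only the remaining elements, the predicated loads and stores leave inactive lanes untouched, and the do/while keeps running while svptest_any reports at least one active lane. The same loop shape in isolation, for MAX on float32 (requires an SVE-enabled compiler and target):

    #include <arm_sve.h>
    #include <cstdint>

    void sve_max_f32(const float *a, const float *b, float *out, int64_t n)
    {
        int64_t x = 0;
        svbool_t pg = svwhilelt_b32_s64(x, n);           // lanes [x, n) are active
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x); // inactive lanes are not touched
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svmax_f32_z(pg, va, vb));
            x += static_cast<int64_t>(svcntw());         // number of 32-bit lanes per vector
            pg = svwhilelt_b32_s64(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
    }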
Window &window) -{ - using VectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &arithmetic_op_loop, - &arithmetic_op_broadcast_loop); -} - -template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &comparison_op_loop, - &comparison_op_broadcast_loop); -} - -template <> -svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) -{ - ARM_COMPUTE_UNUSED(pg, a, b); - ARM_COMPUTE_ERROR("Not supported"); -} - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
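The long block of explicit instantiations above and below exists because these SVE kernels are function templates defined in a .cpp file rather than a header: every operation/element-type combination used elsewhere in the library must be instantiated here, or the linker would find no definition. The mechanism in miniature (illustrative names):

    // definition lives in a .cpp ...
    template <typename T>
    T twice(T v)
    {
        return v + v;
    }

    // ... so this translation unit must emit the specialisations other files link against
    template int   twice<int>(int);
    template float twice<float>(float);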
Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h deleted file mode 100644 index fea38d2995..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
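The instantiations above are driven by a predicated SVE loop: an svwhilelt predicate covers the elements still to be processed, the vector op runs under that predicate, and the loop ends once svptest_any reports no active lanes. A minimal stand-alone sketch of that pattern, assuming an SVE-enabled toolchain, plain float32 pointers and a fixed addition op instead of the library's Window/Iterator plumbing and op dispatch:

#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *out, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // predicate covering the lanes still to process
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb)); // inactive lanes are not stored
        x += svcntw();                                   // number of 32-bit lanes per SVE vector
        pg = svwhilelt_b32(x, len);
    } while(svptest_any(svptrue_b32(), pg));
}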
- */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svpow_z(pg, a, b); -} - -template -VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svdiv_z(pg, a, b); -} - -template -svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; -} - -template -VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) -{ - using ScalarType = typename wrapper::sve_scalar::type; - VectorType res{}; - - switch(op) - { - case ArithmeticOperation::MAX: - res = svmax_z(pg, a, b); - break; - case ArithmeticOperation::MIN: - res = svmin_z(pg, a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const auto tmp = svsub_z(pg, a, b); - res = svmul_z(pg, tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const auto zero = svdup_n(ScalarType(0)); - const auto tmp = svmul_z(pg, a, b); - const auto gt = svcmpgt(pg, a, zero); - res = svsel(gt, a, tmp); - break; - } - case ArithmeticOperation::DIV: - { - res = elementwise_div(pg, a, b); - break; - } - case ArithmeticOperation::POWER: - { - res = elementwise_pow(pg, a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) -{ - svbool_t selection_vector{}; - - switch(op) - { - case ComparisonOperation::Equal: - selection_vector = svcmpeq(pg, a, b); - break; - case ComparisonOperation::NotEqual: - selection_vector = svcmpne(pg, a, b); - break; - case ComparisonOperation::Greater: - selection_vector = svcmpgt(pg, a, b); - break; - case ComparisonOperation::GreaterEqual: - selection_vector = svcmpge(pg, a, b); - break; - case ComparisonOperation::Less: - selection_vector = svcmplt(pg, a, b); - break; - case ComparisonOperation::LessEqual: - selection_vector = svcmple(pg, a, b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - using InputScalarType = typename wrapper::sve_scalar::type; - selection_vector = narrow_to_byte_predicate(selection_vector); - - using OutputScalarType = typename wrapper::sve_scalar::type; - const auto false_vector = svdup_n(static_cast((uint32_t)0)); - const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); - auto ret = svsel(selection_vector, true_vector, false_vector); - - return ret; -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template -void elementwise_comparison_op(const 
ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h deleted file mode 100644 index 5e04128b44..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct QuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; - - const svint32_t &in1_offset; - const svint32_t &in2_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &in2_scale; - const svfloat32_t &out_scale; -}; - -template -struct BroadcastQuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - float broadcast_value; - OutputScalarType *output_ptr; - bool reorder; - - const svint32_t &in1_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &out_scale; -}; - -svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); -} - -svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - //vprint(x); - - const 
auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); -} - -void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - svst1(pg, ptr, narrowed); -} - -void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - - svst1(pg, ptr, narrowed); -} - -template -inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
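Behind the widen/convert/narrow intrinsics, load_quantized and store_quantized implement the usual asymmetric-quantization round trip: dequantize each input with (q - offset) * scale, apply the operation in float, then requantize with the output's inverse scale and offset. A scalar model of that arithmetic (not the library's code; std::lround stands in for the round-to-nearest svrinta, and each input carries its own quantization parameters):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantized_binary_op(uint8_t qa, uint8_t qb,
                            int32_t a_offset, float a_scale,
                            int32_t b_offset, float b_scale,
                            int32_t out_offset, float out_inv_scale,
                            float (*op)(float, float))
{
    const float   a = (static_cast<int32_t>(qa) - a_offset) * a_scale; // dequantize input 1
    const float   b = (static_cast<int32_t>(qb) - b_offset) * b_scale; // dequantize input 2
    const int32_t q = static_cast<int32_t>(std::lround(op(a, b) * out_inv_scale)) + out_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));        // saturate to QASYMM8
}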
in1 : in2; - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_arithmetic_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_arithmetic_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_arithmetic_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
in1 : in2; - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_comparison_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_comparison_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_comparison_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments &); - -template -using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments &); - -template ::type, - typename OutputScalarType = typename wrapper::sve_scalar::type> -void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopQuantizedFuncType func, - BroadcastQuantizedLoopFuncType broadcast_func) -{ - const auto all_true_pg = wrapper::svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); - const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? 
in2->info()->quantization_info() : in1->info()->quantization_info(); - - const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); - const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = BroadcastQuantizedLoopArguments - { - op, - non_broadcast_input_ptr + x, - Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo), - output_ptr + x, - !is_broadcast_input_2, - non_broadcast_voffset, output_voffset, - non_broadcast_vscale, output_vscale - }; - broadcast_func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); - const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); - - const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); - const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = QuantizedLoopArguments - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x, - in1_voffset, in2_voffset, output_voffset, - in1_vscale, in2_vscale, output_vscale - }; - func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - &arithmetic_op_quantized_loop, - &arithmetic_op_broadcast_quantized_loop); -} - -template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename wrapper::traits::sve_vector::type; - using OutputVectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - 
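When one input is broadcast along X, the kernel dequantizes the single broadcast value once, splats it, and reuses the same per-vector op; the reorder flag then restores the original operand order, which matters for non-commutative operations such as PRELU, DIV and POWER. A plain scalar sketch of that operand handling (hypothetical helper name, not the library's API):

#include <functional>

float apply_with_broadcast(float non_broadcast_value, float broadcast_value, bool reorder,
                           const std::function<float(float, float)> &op)
{
    // reorder == true means the broadcast tensor was the first operand of the
    // original call, so it has to go back on the left-hand side.
    const float lhs = reorder ? broadcast_value : non_broadcast_value;
    const float rhs = reorder ? non_broadcast_value : broadcast_value;
    return op(lhs, rhs);
}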
&comparison_op_quantized_loop, - &comparison_op_broadcast_quantized_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp deleted file mode 100644 index ddf1febd66..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const 
Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h deleted file mode 100644 index c2b495f27c..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#if defined(ARM_COMPUTE_ENABLE_SVE) - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/floor/list.h b/src/core/cpu/kernels/floor/list.h deleted file mode 100644 index 4367e0ffc9..0000000000 --- a/src/core/cpu/kernels/floor/list.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
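The unary kernel above uses std::enable_if to give floating-point element types the full operation set (RSQRT, EXP, LOG, ROUND, SIN, ...) while integer types only get NEG and ABS. A small stand-alone sketch of the same SFINAE dispatch, using scalar math and hypothetical names:

#include <cmath>
#include <cstdlib>
#include <type_traits>

enum class UnaryOp { ABS, NEG, EXP };

// Floating-point element types support the full op set...
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type apply(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::ABS: return std::fabs(a);
        case UnaryOp::NEG: return -a;
        case UnaryOp::EXP: return std::exp(a);
    }
    return a;
}

// ...while integer element types only support ABS and NEG.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type apply(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::ABS: return a < 0 ? -a : a;
        case UnaryOp::NEG: return -a;
        default: std::abort(); // mirrors ARM_COMPUTE_ERROR("NOT_SUPPORTED")
    }
}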
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H -#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_FLOOR_KERNEL(func_name) \ - void func_name(const void *src, void *dst, int len) - -DECLARE_FLOOR_KERNEL(fp16_neon_floor); -DECLARE_FLOOR_KERNEL(fp32_neon_floor); - -#undef DECLARE_FLOOR_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */ diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp deleted file mode 100644 index f362676a36..0000000000 --- a/src/core/cpu/kernels/floor/neon/fp16.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 8; - -void fp16_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast<__fp16 *>(dst); - - for(; len >= step; len -= step) - { - vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++psrc; - ++pdst; - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp deleted file mode 100644 index f5efb2e849..0000000000 --- a/src/core/cpu/kernels/floor/neon/fp32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 4; - -void fp32_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast(dst); - - for(; len >= step; len -= step) - { - vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++pdst; - ++psrc; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp deleted file mode 100644 index f5c63b763f..0000000000 --- a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
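Both floor kernels follow the same shape: a NEON main loop that processes one full vector per iteration, then a scalar tail for the leftover elements. A stand-alone fp32 analogue of that pattern, using the ARMv8 vrndmq_f32 round-toward-minus-infinity intrinsic in place of the library's vfloorq_f32 helper:

#include <arm_neon.h>
#include <cmath>

void floor_f32(const float *src, float *dst, int len)
{
    constexpr int step = 4; // four fp32 lanes per 128-bit NEON register
    for(; len >= step; len -= step, src += step, dst += step)
    {
        vst1q_f32(dst, vrndmq_f32(vld1q_f32(src)));
    }
    for(; len > 0; --len)
    {
        *dst++ = std::floor(*src++); // scalar tail for the remaining elements
    }
}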
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/AssemblyUtils.h" - -#include "src/core/NEON/kernels/assembly/depthwise.hpp" - -#include "depthwise_common.hpp" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -constexpr unsigned int idx_width = 1; -constexpr unsigned int idx_height = 2; -constexpr unsigned int idx_channels = 0; -constexpr unsigned int idx_batches = 3; - -template -void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel) -{ - unsigned int stride_cols{}; - unsigned int stride_rows{}; - std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - const unsigned int kernel_cols = weights->dimension(idx_width); - const unsigned int kernel_rows = weights->dimension(idx_height); - - const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); - - // Configure assembly pooling kernel - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args); - if(dwc_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - kernel = std::move(dwc_kernel_asm); -} - -template -void 
create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel, - std::vector &multipliers, std::vector &right_shifts, std::vector &left_shifts) -{ - unsigned int stride_cols{}; - unsigned int stride_rows{}; - std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - const unsigned int kernel_cols = weights->dimension(idx_width); - const unsigned int kernel_rows = weights->dimension(idx_height); - - const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto weights_qinfo = weights->quantization_info(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - const unsigned int num_filters = weights_qinfo.scale().size(); - - multipliers.resize(num_filters); - std::vector dst_shifts(num_filters); - quantization::compute_quantized_multipliers_and_shifts(src, - weights, - dst, - multipliers.data(), - dst_shifts.data()); - - // Quantize activation bounds - int32_t min_activation = std::numeric_limits::lowest(); - int32_t max_activation = std::numeric_limits::max(); - if(info.act_info.enabled()) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); - } - - // Set quantization parameters for assembly kernels - arm_gemm::Requantize32 requant_args{}; - if(is_data_type_quantized_per_channel(weights->data_type())) - { - left_shifts.resize(num_filters); - right_shifts.resize(num_filters); - bool need_left_shift = false; // Select more optimized path if left shift is not needed - for(unsigned int i = 0; i < num_filters; ++i) - { - left_shifts[i] = std::max(-dst_shifts[i], static_cast(0)); - right_shifts[i] = std::min(-dst_shifts[i], static_cast(0)); - if(dst_shifts[i] < 0 && !need_left_shift) - { - need_left_shift = true; - } - } - - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - (need_left_shift) ? 
left_shifts.data() : nullptr, - right_shifts.data(), - multipliers.data(), - static_cast(min_activation), - static_cast(max_activation)); - } - else - { - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - -dst_shifts[0], - multipliers[0], - static_cast(min_activation), - static_cast(max_activation)); - } - - // Configure assembly pooling kernel with requantization - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args, requant_args); - if(dwc_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - kernel = std::move(dwc_kernel_asm); -} -} // namespace - -CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() - : _kernel_asm(nullptr), - _multipliers(), - _left_shifts(), - _right_shifts() -{ -} - -CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; - -void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info) -{ - ARM_COMPUTE_UNUSED(cpu_info); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Destination initialization if not yet initialized - const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - -#if defined(__aarch64__) - switch(src->data_type()) - { - case DataType::QASYMM8: - if(is_data_type_quantized_per_channel(weights->data_type())) - { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - } - else - { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - } - break; - case DataType::QASYMM8_SIGNED: - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - break; -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - case DataType::F16: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); - break; -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - case DataType::F32: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); - break; - default: - break; - } -#endif // defined(__aarch64__) - - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - -#if !defined(__aarch64__) - ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); -#endif // !defined(__aarch64__) - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)"); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 
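For per-channel quantized weights, the kernel converts each signed dst_shift into the pair of non-negative left / non-positive right shifts that arm_gemm::Requantize32 expects, and records whether any channel actually needs a left shift so the cheaper path can be selected otherwise. The same preparation in isolation:

#include <algorithm>
#include <cstdint>
#include <vector>

struct SplitShifts
{
    std::vector<int32_t> left, right;
    bool                 need_left_shift = false;
};

SplitShifts split_shifts(const std::vector<int32_t> &dst_shifts)
{
    SplitShifts out;
    out.left.reserve(dst_shifts.size());
    out.right.reserve(dst_shifts.size());
    for(int32_t s : dst_shifts)
    {
        out.left.push_back(std::max(-s, 0));   // negative dst_shift becomes a left shift
        out.right.push_back(std::min(-s, 0));  // positive dst_shift becomes a right shift
        out.need_left_shift = out.need_left_shift || (s < 0);
    }
    return out;
}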
weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - } - } - - if(dst->total_size() > 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - return Status{}; -} - -void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); - - const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); - auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); - - const auto src_shape = src->info()->tensor_shape(); - const auto dst_shape = dst->info()->tensor_shape(); - const auto src_padding = src->info()->padding(); - const auto dst_padding = dst->info()->padding(); - - const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; - const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); - const size_t ld_src_batch = ld_src_row * src_shape[2]; - const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; - const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); - const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - - _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, - parameters_ptr, - dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); -} - -void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) -{ - _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); -} - -size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const -{ - return _kernel_asm->get_storage_size(); -} - -size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const -{ - return _kernel_asm->get_working_size(num_threads, num_input_channels); -} - -bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const -{ - return _kernel_asm != nullptr; -} - -const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const -{ - return "CpuDepthwiseConv2dAssemblyWrapperKernel"; -} -} // namespace kernels -} // namespace 
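run_op above derives the leading dimensions handed to the assembly kernel directly from the tensor shape plus its element padding: dimension 0 is the innermost one, and padding on a dimension widens every stride that contains it. The same computation in isolation:

#include <cstddef>

struct LeadingDims
{
    size_t col, row, batch;
};

LeadingDims compute_leading_dims(const size_t shape[3],             // dims 0..2 of the tensor shape
                                 size_t pad_left, size_t pad_right, // element padding on dimension 0
                                 size_t pad_top, size_t pad_bottom) // element padding on dimension 1
{
    LeadingDims ld{};
    ld.col   = shape[0] + pad_left + pad_right;
    ld.row   = ld.col * (shape[1] + pad_top + pad_bottom);
    ld.batch = ld.row * shape[2];
    return ld;
}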
cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h deleted file mode 100644 index 8ff44441e9..0000000000 --- a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_conv -{ -namespace depthwise -{ -// Forward declarations -class IDepthwiseCommon; -} // depthwise -} // arm_conv - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** This class is a wrapper for the depthwise convolution assembly kernels. */ -class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel -{ -public: - /** Default constructor */ - CpuDepthwiseConv2dAssemblyWrapperKernel(); - ~CpuDepthwiseConv2dAssemblyWrapperKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel); - - /** Initialise the kernel's src and dst. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution layer meta-data. - * @param[in] cpu_info CPU information needed to select the most appropriate kernel. - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); - - /** Indicates whether or not this function can be used to process the given parameters. - * - * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure() - * - * @return a status. 
- */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - /** Pack bias and weights in a storage space for the assembly kernel - * - * @param[in] parameters_ptr Pointer to storage space. - * @param[in] bias_ptr Pointer to bias buffer. - * @param[in] weights_ptr Pointer to weights buffer. - * @param[in] ld_weights_col Columns displacement for the weights tensor. - * @param[in] ld_weights_row Rows displacement for the weights tensor. - */ - void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); - - /** Get the amount of storage space required for the rearranged weights and bias. - * - * @return size of workspace - */ - size_t get_storage_size() const; - - /** Get size of the workspace needed by the assembly kernel. - * - * @param[in] num_threads Maximum number of threads that are going to be spawned. - * @param[in] num_input_channels Number of channels of the input tensor. - * - * @return size of workspace - */ - size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const; - - /** Was the asm kernel successfully configured? - * - * @return True if the asm kernel is configured and ready to run - */ - bool is_configured() const; - -private: - std::unique_ptr _kernel_asm; - std::vector _multipliers{}; - std::vector _left_shifts{}; - std::vector _right_shifts{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp deleted file mode 100644 index 89dd27a20a..0000000000 --- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - ARM_COMPUTE_UNUSED(cpu_info); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // dst initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); - -#if defined(__aarch64__) - const bool requantize = src->quantization_info() != dst->quantization_info(); - - switch(src->data_type()) - { - case DataType::QASYMM8: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; - case DataType::QASYMM8_SIGNED: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_pooling(src, dst, info, cpu_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - create_arm_pooling(src, dst, info, cpu_info); - break; - default: - break; - } -#endif // defined(__aarch64__) - - Window win = calculate_max_window(*dst, Steps()); - INEKernel::configure(win); -} - -Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), - "Only AVG and MAX pooling are supported by assembly kernels"); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - if(src_qinfo != dst_qinfo) - { - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - // If dst 
is not configured, the quantization info are the same - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - return Status{}; -} - -void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); - - const auto src_shape = src->info()->tensor_shape(); - const auto dst_shape = dst->info()->tensor_shape(); - const auto src_padding = src->info()->padding(); - const auto dst_padding = dst->info()->padding(); - - const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; - const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); - const size_t ld_src_batch = ld_src_row * src_shape[2]; - const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; - const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); - const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); -} - -size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const -{ - return _kernel_asm->get_working_size(num_threads); -} - -bool CpuPool2dAssemblyWrapperKernel::is_configured() const -{ - return _kernel_asm != nullptr; -} - -template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - // Configure assembly pooling kernel - auto pooling_kernel_asm = arm_conv::pooling::pooling(args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} - -template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, - dst_shift, // left shift - 0, // right shift - dst_multiplier); - - // Configure assembly pooling kernel with 
requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h deleted file mode 100644 index 3afa4c16a4..0000000000 --- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -#include "pool_common.hpp" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** This class is a wrapper for the assembly kernels. - * - * Some kernels were written in assembly and highly optimised for specific - * CPUs like A53 or A55. The arm compute library creates an instance of - * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to - * execute a single assembly kernel in the context of an NEFunction. - * - */ -class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel -{ -public: - /** Constructor - */ - CpuPool2dAssemblyWrapperKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel); - - const char *name() const override - { - return "CpuPool2dAssemblyWrapperKernel"; - } - - /** Initialise the kernel's src and dst. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. - * @param[in] info Pooling meta-data. - * @param[in] cpu_info CPU information needed to select the most appropriate kernel. 
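A hedged sketch of how an operator might drive this wrapper kernel end to end. Only configure()/validate()/is_configured()/get_working_size() and the ACL_SRC / ACL_DST / ACL_INT_0 pack slots come from the code above; the names src_info, dst_info, pool_info, src, dst and workspace, and the scheduler wiring, are assumptions made for illustration only.

    // Illustrative operator-side wiring; not part of the deleted sources.
    ARM_COMPUTE_ERROR_THROW_ON(CpuPool2dAssemblyWrapperKernel::validate(src_info, dst_info, pool_info));

    CpuPool2dAssemblyWrapperKernel pool_kernel;
    pool_kernel.configure(src_info, dst_info, pool_info, NEScheduler::get().cpu_info());

    if(pool_kernel.is_configured())
    {
        // The assembly path needs a per-run scratch buffer sized for the worst-case thread count.
        const size_t workspace_bytes = pool_kernel.get_working_size(NEScheduler::get().num_threads());

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, src);   // input
        pack.add_tensor(TensorType::ACL_DST, dst);         // output
        pack.add_tensor(TensorType::ACL_INT_0, workspace); // scratch tensor of workspace_bytes bytes

        // An IScheduler then dispatches run_op() over pool_kernel.window() with this pack.
    }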
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2dAssemblyWrapperKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Get size of the workspace needed by the assembly kernel. - * - * @param[in] num_threads Maximum number of threads that are going to be spawned. - * - * @return size of workspace - */ - size_t get_working_size(unsigned int num_threads) const; - - /** Was the asm kernel successfully configured? - * - * @return True if the asm kernel is configured and ready to run - */ - bool is_configured() const; - -private: - /** Helper function to create the assembly kernel. - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. - */ - template - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Helper function to create the assembly kernel with requantization support - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. - */ - template - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - std::unique_ptr _kernel_asm{ nullptr }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/pool2d/neon/fp16.cpp b/src/core/cpu/kernels/pool2d/neon/fp16.cpp deleted file mode 100644 index d21e153f25..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/fp16.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
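The requantized creation path above (create_arm_pooling_requant) folds the ratio src_scale / dst_scale into an integer multiplier and shift via quantization::calculate_quantized_multiplier. A rough, standalone illustration of that kind of fixed-point decomposition; the function and variable names are mine, not the library routine's, and it assumes a positive multiplier.

    #include <cmath>
    #include <cstdint>

    // Split a positive real multiplier into a Q0.31 fixed-point multiplier and a shift,
    // so that  value * multiplier  can be approximated as  (value * quant_mult) >> (31 + shift)
    // in integer arithmetic (shift is non-negative for multipliers below 1).
    void decompose_multiplier(float multiplier, int32_t *quant_mult, int32_t *shift)
    {
        int exponent = 0;
        const double mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
        int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
        if(q == (1ll << 31)) // rounding pushed the mantissa up to 1.0
        {
            q /= 2;
            ++exponent;
        }
        *quant_mult = static_cast<int32_t>(q);
        *shift      = -exponent; // positive value means shift right
    }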
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 8; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - Iterator indices(dst1, window_out); - - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - - const int pad_right = src->info()->padding().right; - const int pad_left = src->info()->padding().left; - const int pad_horizontal = pad_right + pad_left; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = 
offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); - vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} - -void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 8; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - float16x8_t vres; - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); - - // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vaddq_f16(vres, vmulq_f16(data, data)); - } - else - { - vres = vaddq_f16(vres, data); - } - } - } - // Divide by scale - vres = vmulq_f16(vres, scale_v); - } - else - { - vres = vdupq_n_f16(std::numeric_limits::lowest()); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = 
vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f16(vres, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); - } - - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - res += data * data; - } - else - { - res += data; - } - } - } - - // Divide by scale - res *= scale; - } - else - { - res = std::numeric_limits::lowest(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/fp32.cpp b/src/core/cpu/kernels/pool2d/neon/fp32.cpp deleted file mode 100644 index c82cad0ffd..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/fp32.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
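The L2-pooling epilogue just above avoids a direct square root: vrsqrteq gives a rough 1/sqrt estimate, one vrsqrtsq step refines it (vrsqrts(a, b) computes (3 - a*b) / 2), and multiplying the refined reciprocal by the input yields the square root. The same trick in fp32, as a self-contained sketch:

    #include <arm_neon.h>

    // sqrt(x) via a reciprocal-sqrt estimate plus one Newton-Raphson refinement
    // (mirrors the f16 epilogue above; x == 0 yields NaN here, since 0 * inf).
    static inline float32x4_t sqrt_via_rsqrt_f32(float32x4_t x)
    {
        float32x4_t r = vrsqrteq_f32(x);                    // r ~ 1/sqrt(x)
        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r)); // r *= (3 - x*r*r) / 2
        return vmulq_f32(x, r);                             // x * 1/sqrt(x) == sqrt(x)
    }

For comparison, the fp32 NHWC kernel further down extracts each lane and calls sqrt directly instead.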
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 4; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - Iterator indices(dst1, window_out); - - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - - float32x4_t vres; - float res; - - const int pad_right = src->info()->padding().right; - const int pad_left = src->info()->padding().left; - const int pad_horizontal = pad_right + pad_left; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); - const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); - const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); - const auto v_x3 = 
vld1q_f32(in_x3_ptr + x_off); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - - // Store indices - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} - -void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 4; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - const int pool_size_x = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - float32x4_t vres; - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); - - // Perform pooling - vres = vdupq_n_f32(0.0f); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else - { - vres = vaddq_f32(vres, data); - } - } - } - // Divide by scale - vres = vmulq_f32(vres, scale_v); - } - else - { - vres = vdupq_n_f32(std::numeric_limits::lowest()); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f32(vres, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast(sqrt(vgetq_lane_f32(vres, 0))), - static_cast(sqrt(vgetq_lane_f32(vres, 1))), - static_cast(sqrt(vgetq_lane_f32(vres, 2))), - static_cast(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } - - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = 
calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - res += data * data; - } - else - { - res += data; - } - } - } - - // Divide by scale - res *= scale; - } - else - { - res = std::numeric_limits::lowest(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/list.h b/src/core/cpu/kernels/pool2d/neon/list.h deleted file mode 100644 index f1e23d43cf..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/list.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
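Both NHWC kernels above divide the accumulated sum by the value returned from calculate_avg_scale, i.e. one over the number of window positions actually used, clamped to the tensor bounds and optionally excluding padded positions. A hedged sketch of that computation; the signature and names below are mine, not the library helper's.

    #include <algorithm>

    // Scale applied after summing one average-pooling window located at output (out_x, out_y).
    float avg_pool_scale(int out_x, int out_y, int pool_w, int pool_h,
                         int upper_bound_w, int upper_bound_h,
                         int pad_left, int pad_top,
                         int stride_x, int stride_y, bool exclude_padding)
    {
        int start_x = out_x * stride_x - pad_left;
        int start_y = out_y * stride_y - pad_top;
        const int end_x = std::min(start_x + pool_w, upper_bound_w);
        const int end_y = std::min(start_y + pool_h, upper_bound_h);
        if(exclude_padding)
        {
            // Padded positions do not count towards the average.
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return 1.f / static_cast<float>((end_x - start_x) * (end_y - start_y));
    }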
- */ -#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H -#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/cpu/kernels/pool2d/neon/quantized.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_POOLING_KERNEL(func_name) \ - void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window) - -DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc); - -#if defined(ENABLE_NCHW_KERNELS) - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw); -DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw); -DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw); -#endif /* defined(ENABLE_NCHW_KERNELS) */ - -#undef DECLARE_POOLING_KERNEL - -template -inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout) -{ - const int pad_left = info.padding().left; - const int pad_right = info.padding().right; - const int pad_top = info.padding().top; - const int pad_bottom = info.padding().bottom; - const int in_stride_y = static_cast(info.strides_in_bytes().y()); - const int in_stride_w = static_cast(info.strides_in_bytes()[3]); - const int pad_horiz = pad_left + pad_right; - const int pad_vert = pad_top + pad_bottom; - - if(data_layout == DataLayout::NCHW) - { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ - - pad_top * sizeof(T) /* top padding */ - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ - - in_stride_w * id[3]; - - return offset_base; - } - else - { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row - - pad_top * sizeof(T) // top padding - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems - - in_stride_w * id[3]; - - return offset_base; - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp deleted file mode 100644 index bece438989..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
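The offset_no_padding() helper above strips the padding contributions out of the iterator's byte offset so that the stored max-pool index refers to the unpadded source tensor, which is what a later max-unpooling step expects. For an unpadded buffer that index is plain coordinate arithmetic; a small illustration of the quantity being recovered (shapes and layout order assumed, not taken from the library):

    #include <cstdint>

    // Linear element index into an unpadded NCHW tensor of shape [N, C, H, W].
    static inline uint32_t linear_index_nchw(uint32_t n, uint32_t c, uint32_t y, uint32_t x,
                                             uint32_t C, uint32_t H, uint32_t W)
    {
        return ((n * C + c) * H + y) * W + x;
    }

    // Linear element index into an unpadded NHWC tensor of shape [N, H, W, C].
    static inline uint32_t linear_index_nhwc(uint32_t n, uint32_t y, uint32_t x, uint32_t c,
                                             uint32_t H, uint32_t W, uint32_t C)
    {
        return ((n * H + y) * W + x) * C + c;
    }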
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -#ifdef ENABLE_NCHW_KERNELS -namespace arm_compute -{ -namespace cpu -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - ARM_COMPUTE_UNUSED(pool_info.pool_type); - ARM_COMPUTE_UNUSED(pool_info.exclude_padding); - - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16x4_t top_data = vld1_f16(reinterpret_cast(src_top_ptr + in.offset())); - float16x4_t middle_data = vld1_f16(reinterpret_cast(src_middle_ptr + in.offset())); - float16x4_t bottom_data = vld1_f16(reinterpret_cast(src_bottom_ptr + in.offset())); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - middle_data = vmul_f16(middle_data, middle_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - // Perform pooling - const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); - res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); - res = vmul_f16(vpadd_f16(res, res), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); - res = vpmax_f16(vset_lane_f16(-std::numeric_limits::max(), max_data, 3), max_data); - res = vpmax_f16(res, res); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vinv_f16(vinvsqrt_f16(res)); - } - - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); -} - -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float16x4_t in) -{ - float32x2_t out = { static_cast(vget_lane_f16(in, 0)), static_cast(vget_lane_f16(in, 1)) }; - return out; -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float32x2_t in) -{ - return in; -} - -template -void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - Iterator in(src, window_src); - Iterator out(dst0, window); - Iterator indices(dst1, window); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const int pad_left = src->info()->padding().left; - const int pad_right = src->info()->padding().right; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - - execute_window_loop(window, [&](const Coordinates & id) - { - auto top_data = wrapper::vload(reinterpret_cast(src_top_ptr + in.offset())); - auto bottom_data = 
wrapper::vload(reinterpret_cast(src_bottom_ptr + in.offset())); - float32x2_t top_data_f32 = f16_to_f32(top_data); - float32x2_t bottom_data_f32 = f16_to_f32(bottom_data); - - // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. - const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); - const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); - const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); - *(reinterpret_cast(out.ptr())) = static_cast(vget_lane_f32(max_data, 0)); - - // Calculate max data indice, which will be used in max unpool. - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); - const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); - const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; - const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; - const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; - const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); - const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); - *(reinterpret_cast(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); - }, - in, out, indices); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16x4_t top_data = vld1_f16(reinterpret_cast(src_top_ptr + in.offset())); - float16x4_t bottom_data = vld1_f16(reinterpret_cast(src_bottom_ptr + in.offset())); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - - const float16x4_t sum_data = vadd_f16(top_data, bottom_data); - res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(top_data, bottom_data); - res = vpmax_f16(max_data, max_data); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vinv_f16(vinvsqrt_f16(res)); - } - - // Store result - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); - } -} - -void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16_t res = 0.0f; - float16x8_t vres = vdupq_n_f16(0.0f); - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vaddq_f16(vres, vmulq_f16(data, data)); - } - else - { - vres = vaddq_f16(vres, data); - } - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; - } - } - - // Reduction - float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres)); - res += vget_lane_f16(tmp, 0); - res += vget_lane_f16(tmp, 1); - res += vget_lane_f16(tmp, 2); - res += vget_lane_f16(tmp, 3); - - // Divide by scale - res *= scale; - } - else - { - float16x8_t vres = vdupq_n_f16(std::numeric_limits::lowest()); - res = std::numeric_limits::lowest(); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = vmaxq_f16(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } - - float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres)); - res = std::max(res, vget_lane_f16(tmp, 0)); - res = std::max(res, vget_lane_f16(tmp, 1)); - res = std::max(res, vget_lane_f16(tmp, 2)); - res = std::max(res, vget_lane_f16(tmp, 3)); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - execute_window_loop(window, [&](const Coordinates & id) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - float32x4_t vres = vdupq_n_f32(0.0f); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 4); x += 4) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else - { - vres = vaddq_f32(vres, data); - } - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; - } - } - -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - res += vaddvq_f32(vres); -#else // __aarch64__ - // Reduction - float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres)); - tmp = vpadd_f32(tmp, tmp); - - res += vget_lane_f32(tmp, 0); -#endif // __aarch64__ - // Divide by scale - res *= scale; - } - else - { - float32x4_t vres = vdupq_n_f32(std::numeric_limits::lowest()); - res = std::numeric_limits::lowest(); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 4); x += 4) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = vmaxq_f32(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - res = std::max(vmaxvq_f32(vres), res); -#else // __aarch64__ - float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres)); - tmp = vpmax_f32(tmp, tmp); - - res = std::max(res, vget_lane_f32(tmp, 0)); -#endif // __aarch64__ - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - 
{ - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} - -void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - float32x2_t top_data = vld1_f32(in_top_ptr); - float32x2_t bottom_data = vld1_f32(in_bottom_ptr); - float32x2_t res = {}; - float final_res = 0; - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); - } - else - { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); - } -} - -void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + 
(pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x4_t top_data = vld1q_f32(reinterpret_cast(src_top_ptr + in.offset())); - float32x4_t middle_data = vld1q_f32(reinterpret_cast(src_middle_ptr + in.offset())); - float32x4_t bottom_data = vld1q_f32(reinterpret_cast(src_bottom_ptr + in.offset())); - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); -} - -void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 7; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - std::array src_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) - { - src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); - } - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x2_t res = {}; - float final_res = 0.f; - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - float32x4x2_t data = vld2q_f32(reinterpret_cast(src_ptrs[0] + in.offset())); - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) - { - data = vld2q_f32(reinterpret_cast(src_ptrs[i] + in.offset())); - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); - } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - float32x4x2_t max_data = vld2q_f32(reinterpret_cast(src_ptrs[0] + in.offset())); - for(int i = 1; i < pool_size; ++i) - { - const float32x4x2_t data = vld2q_f32(reinterpret_cast(src_ptrs[i] + in.offset())); - max_data = vmax2q_f32(max_data, data); - } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); -} -} // namespace cpu -} // namespace arm_compute - -#endif // ENABLE_NCHW_KERNELS \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp deleted file mode 100644 index 4020e9e3fc..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
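For reference while reading the NCHW float kernels deleted above: pooling2/pooling3/pooling7 and poolingMxN all follow the same recipe of squaring the inputs first when the pool type is L2, accumulating (or taking the running maximum), multiplying by the averaging divisor derived from calculate_avg_scale, and taking a square root at the end for L2. A minimal scalar sketch of that recipe follows; the names are illustrative only, and the padding bookkeeping is simplified (it clips the window to the tensor on every side, whereas the kernels above also count right/bottom padding when exclude_padding is false).

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Scalar reference for one NCHW output element; illustrates the pattern used by
// the NEON kernels above (square -> accumulate/max -> scale -> sqrt for L2).
// Hypothetical helper, not part of the patch.
enum class PoolType { MAX, AVG, L2 };

float pool_one_output_nchw(const std::vector<float> &plane, int in_w, int in_h,
                           int out_x, int out_y, int pool_w, int pool_h,
                           int stride_x, int stride_y, int pad_x, int pad_y,
                           PoolType type, bool exclude_padding)
{
    const int start_x = out_x * stride_x - pad_x;
    const int start_y = out_y * stride_y - pad_y;
    const int end_x   = std::min(start_x + pool_w, in_w);
    const int end_y   = std::min(start_y + pool_h, in_h);
    const int x0      = std::max(start_x, 0);
    const int y0      = std::max(start_y, 0);

    // Averaging divisor: window clipped to the tensor, optionally excluding padding,
    // mirroring what calculate_avg_scale() produces.
    const int   count = exclude_padding ? (end_x - x0) * (end_y - y0)
                                        : (end_x - start_x) * (end_y - start_y);
    const float scale = 1.f / static_cast<float>(count);

    float res = (type == PoolType::MAX) ? std::numeric_limits<float>::lowest() : 0.f;
    for(int y = y0; y < end_y; ++y)
    {
        for(int x = x0; x < end_x; ++x)
        {
            float v = plane[y * in_w + x];
            if(type == PoolType::L2)  v *= v;          // "get power of 2" step
            if(type == PoolType::MAX) res = std::max(res, v);
            else                      res += v;
        }
    }
    if(type != PoolType::MAX) res *= scale;            // divide by the window size
    if(type == PoolType::L2)  res = std::sqrt(res);    // L2 pooling is the sqrt of the mean of squares
    return res;
}

The NEON versions vectorise the inner loops and add scalar leftover loops, but each output value they produce follows this same per-element computation.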
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pool2d/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
deleted file mode 100644
index a899427484..0000000000
--- a/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pool2d/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/pool2d/neon/quantized.h b/src/core/cpu/kernels/pool2d/neon/quantized.h
deleted file mode 100644
index a16960a205..0000000000
--- a/src/core/cpu/kernels/pool2d/neon/quantized.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
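The quantized pooling kernels that follow requantize whenever the source and destination UniformQuantizationInfo differ, by folding the dequantize/quantize pair into a single scale and offset (requant_scale = dst.scale / src.scale, requant_offset = dst.offset - src.offset / requant_scale). A scalar sketch of why the folded form matches the direct dequantize-then-quantize form, up to rounding, is shown below; the struct and helper names are illustrative, not the library's.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: simplified uniform quantization parameters.
struct QInfo { float scale; int32_t offset; };

// Direct form: dequantize with the source parameters, re-quantize with the destination's.
uint8_t requantize_direct(uint8_t q_in, QInfo src, QInfo dst)
{
    const float   real = src.scale * (static_cast<float>(q_in) - static_cast<float>(src.offset));
    const int32_t q    = static_cast<int32_t>(std::lround(real / dst.scale)) + dst.offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

// Folded form, as in the kernels above: precompute one scale/offset pair so the raw
// integer value can go through a single quantize step.
uint8_t requantize_folded(uint8_t q_in, QInfo src, QInfo dst)
{
    const float   requant_scale  = dst.scale / src.scale;
    const int32_t requant_offset = dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / requant_scale);
    const int32_t q              = static_cast<int32_t>(std::lround(static_cast<float>(q_in) / requant_scale)) + requant_offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

The vrequantize_pooling and vrequantize_pooling_with_scale specialisations apply the same idea to whole NEON vectors, widening the 8-bit lanes to float before the single quantize step.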
- */ -#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_QUANTIZED_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -inline typename std::enable_if::value, int8_t>::type -quantize(float val, const UniformQuantizationInfo &info) -{ - return quantize_qasymm8_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8_t>::type -quantize(float val, const UniformQuantizationInfo &info) -{ - return quantize_qasymm8(val, info); -} - -template -inline T vcvtq_q32_f32(float32x4_t values); - -template <> -inline uint32x4_t vcvtq_q32_f32(float32x4_t values) -{ - return vcvtq_u32_f32(values); -} - -template <> -inline int32x4_t vcvtq_q32_f32(float32x4_t values) -{ - return vcvtq_s32_f32(values); -} - -template -inline float32x4_t vcvtq_f32_q32(T values); - -template <> -inline float32x4_t vcvtq_f32_q32(uint32x4_t values) -{ - return vcvtq_f32_u32(values); -} - -template <> -inline float32x4_t vcvtq_f32_q32(int32x4_t values) -{ - return vcvtq_f32_s32(values); -} - -template -inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); - -template <> -inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) -{ - const float new_scale = quant_rescale / scale_pooling; - return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); -} - -template <> -inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) -{ - const float new_scale = quant_rescale / scale_pooling; - return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); -} - -template -inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo); - -template <> -inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x4_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), - } - }; - return vquantize(acc, requant_qinfo); -} - -template <> -inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x4_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), - } - }; - return vquantize_signed(acc, requant_qinfo); -} - -template -inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo); - -template <> -inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x2_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), - } - }; - return vquantize(acc, requant_qinfo); -} - -template <> -inline int8x8_t 
vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x2_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), - } - }; - return vquantize_signed(acc, requant_qinfo); -} - -inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int start_x = id[idx_width] * stride_x - pad_x; - int start_y = id[idx_height] * stride_y - pad_y; - - const int end_x = std::min(start_x + pool_size_x, upper_bound_w); - const int end_y = std::min(start_y + pool_size_y, upper_bound_h); - if(exclude_padding) - { - start_x = std::max(0, start_x); - start_y = std::max(0, start_y); - } - return 1.f / ((end_y - start_y) * (end_x - start_x)); -} - -template -void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16; - const int window_half_step_x = window_step_x / 2; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q32_t = typename wrapper::traits::promote_t; - using q32x4_t = typename wrapper::traits::neon_vector::type; - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); - } - } - - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = - { - { - vcvtq_f32_q32(vres1), - vcvtq_f32_q32(vres2), - vcvtq_f32_q32(vres3), - vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const 
float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); - } - } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); - } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - } - - if(pool_info.pool_type == PoolingType::MAX) - { - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); - } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? 
vrequantize_pooling(vres, requant_qinfo) : vres); - } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - if(pool_info.pool_type != PoolingType::MAX) - { - q32_t res = static_cast(0.f); - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res += data; - } - } - - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - else - { - T res = std::numeric_limits::min(); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - } - - }, - in, out); -} - -#if defined(ENABLE_NCHW_KERNELS) -template -inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, - const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = (id.x() + id_offset) * stride_x - pad_x; - int start_y = id.y() * stride_y - pad_y; - const int end_y = std::min(start_y + pool_size, upper_bound_h); - if(exclude_padding) - { - start_y = std::max(0, start_y); - } - - std::array elems = - { - { - wrapper::vgetlane(v, 0), - wrapper::vgetlane(v, 1), - wrapper::vgetlane(v, 2), - wrapper::vgetlane(v, 3), - wrapper::vgetlane(v, 4), - wrapper::vgetlane(v, 5), - wrapper::vgetlane(v, 6), - wrapper::vgetlane(v, 7), - } - }; - - for(auto &el : elems) - { - int c_start_x = start_x; - const int end_x = std::min(c_start_x + pool_size, upper_bound_w); - if(exclude_padding) - { - c_start_x = std::max(0, c_start_x); - } - float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x)); - el *= scale; - start_x += step * stride_x; - } - - v = wrapper::vsetlane(elems[0], v, 0); - v = wrapper::vsetlane(elems[1], v, 1); - v = wrapper::vsetlane(elems[2], v, 2); - v = wrapper::vsetlane(elems[3], v, 3); - v = wrapper::vsetlane(elems[4], v, 4); - v = wrapper::vsetlane(elems[5], v, 5); - v = wrapper::vsetlane(elems[6], v, 6); - v = wrapper::vsetlane(elems[7], v, 7); -} - -template -void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, 
const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q8x8x2_t = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x4_t = typename wrapper::traits::neon_vector::type; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q16x8x2_t = typename wrapper::traits::neon_vector::type; - - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - - const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; - - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const bool have_different_qinfo = src_qinfo != dst_qinfo; - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); - const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); - q8x8_t lower_res = {}; - q8x8_t upper_res = {}; - - if(pool_info.pool_type != PoolingType::MAX) - { - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Add rows - const q16x8x2_t vrsum = - { - { - wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), - wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), - } - }; - - // Pair-wise add row data - const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); - const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); - - q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); - - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - lower_res = wrapper::vmovn(res_lower); - - // Compute upper result for stride_x == 1 - if(pool_stride_x == 1) - { - // Shifted row sum - 
const q16x8x2_t vrsum_shifted = - { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - - // Pair-wise add shifted row - q16x8_t res_upper = wrapper::vcombine( - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - - // Scale upper result - scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - upper_res = wrapper::vmovn(res_upper); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); - lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); - if(pool_stride_x == 1) - { - const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); - upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); - } - } - - if(have_different_qinfo) - { - const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); - lower_res = wrapper::vgetlow(requantized_dst); - upper_res = wrapper::vgethigh(requantized_dst); - } - - // Store result - if(pool_stride_x == 1) - { - const q8x8x2_t res = { { lower_res, upper_res } }; - wrapper::vstore(reinterpret_cast(out.ptr()), res); - } - else - { - wrapper::vstore(reinterpret_cast(out.ptr()), lower_res); - } - }, - in, out); -} - -template -void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q8x8x2_t = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q16x8x2_t = typename wrapper::traits::neon_vector::type; - - constexpr int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_middle_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); - const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset()); - const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); - q8x8_t fres = {}; - q8x16_t fqres = {}; - - if(pool_info.pool_type == PoolingType::AVG) - { - // Convert data to u16 - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Calculate row sums - const q16x8x2_t vrsum = - { - { - wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), - wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), - } - }; - const q16x8x2_t vrsum_shifted_1 = - { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - const q16x8x2_t vrsum_shifted_2 = - { - { - wrapper::vext_2(vrsum.val[0], vrsum.val[1]), - wrapper::vext_2(vrsum.val[1], vrsum.val[1]) - } - }; - // Calculate final sum - q16x8x2_t final_sum = - { - { - wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), - wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), - } - }; - if(pool_stride_x == 2) - { - q16x8_t res = - { - wrapper::vgetlane(final_sum.val[0], 0), - wrapper::vgetlane(final_sum.val[0], 2), - wrapper::vgetlane(final_sum.val[0], 4), - wrapper::vgetlane(final_sum.val[0], 6), - wrapper::vgetlane(final_sum.val[1], 0), - wrapper::vgetlane(final_sum.val[1], 2), - wrapper::vgetlane(final_sum.val[1], 4), - wrapper::vgetlane(final_sum.val[1], 6), - }; - - scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fres = wrapper::vmovn(res); - } - else - { - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, 
pool_stride_y); - fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); - const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); - const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); - const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); - - if(pool_stride_x == 2) - { - const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; - static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - fres = wrapper::vtbl(table, lookup_val); - } - else - { - fqres = final_max; - } - } - - // Store result - if(pool_stride_x == 1) - { - if(src_qinfo != dst_qinfo) - { - fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); - } - wrapper::vstore(reinterpret_cast(out.ptr()), fqres); - } - else - { - if(src_qinfo != dst_qinfo) - { - fres = vrequantize_pooling(fres, requant_qinfo); - } - wrapper::vstore(reinterpret_cast(out.ptr()), fres); - } - }, - in, out); -} - -template -void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q32_t = typename wrapper::traits::promote_t; - using q32x4_t = typename wrapper::traits::neon_vector::type; - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - - execute_window_loop(window, [&](const Coordinates & id) - { - T res = std::numeric_limits::min(); - - if(pool_info.pool_type != PoolingType::MAX) - { - q32x4_t vres = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32_t sres = 0; - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - const q16x8_t data_q16 = wrapper::vmovl(data); - vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16))); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - sres += data; - } - } - - // Reduction - const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres)); - sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1); - - // Divide by scale - res = static_cast(support::cpp11::round(sres * scale)); - } - else - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = wrapper::vmax(vres, data); - } - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } - - // Reduce max - vres = wrapper::vpmax(vres, vres); - vres = wrapper::vpmax(vres, vres); - vres = wrapper::vpmax(vres, vres); - - // Get max value - res = std::max(res, wrapper::vgetlane(vres, 0)); - } - // Store result - res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize(Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) : res; - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} -#endif /* defined(ENABLE_NCHW_KERNELS) */ -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H diff --git a/src/core/cpu/kernels/scale/neon/fp16.cpp b/src/core/cpu/kernels/scale/neon/fp16.cpp deleted file mode 100644 index 0ad66cab1c..0000000000 --- a/src/core/cpu/kernels/scale/neon/fp16.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
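The scale kernels that follow (fp16 here, u8 in integer.cpp further down) select the nearest-neighbour source sample from a precomputed width offset plus a height index derived from the resize ratio. Assuming calculate_resize_ratio returns input_size / output_size (with the usual minus-one adjustment when align_corners is set), the index selection reduces to the sketch below; the helper names are not from the patch.

#include <cmath>
#include <cstdint>

// Illustrative sketch of the row-index selection used by the *_neon_scale_nearest kernels.
inline float resize_ratio(int in_size, int out_size, bool align_corners)
{
    return (align_corners && out_size > 1) ? static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1)
                                           : static_cast<float>(in_size) / static_cast<float>(out_size);
}

inline int32_t nearest_src_index(int32_t dst_index, float ratio, float sampling_offset, bool align_corners)
{
    const float pos = (static_cast<float>(dst_index) + sampling_offset) * ratio;
    // align_corners rounds half away from zero; otherwise the position is floored.
    return align_corners ? static_cast<int32_t>(std::lround(pos)) : static_cast<int32_t>(std::floor(pos));
}

Because the layout is NHWC, the selected row is then copied channel by channel with full vector loads plus a scalar tail, which is all the main loop in fp16_neon_scale_nearest does.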
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace -{ -void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 8; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - using ConstType = typename std::conditional::value, half, float16_t>::type; - - const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/integer.cpp b/src/core/cpu/kernels/scale/neon/integer.cpp deleted file mode 100644 index a2359aac94..0000000000 --- a/src/core/cpu/kernels/scale/neon/integer.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
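For the bilinear paths, delta_bilinear blends the four neighbouring samples using corner weights built from the fractional offsets dx and dy; the CONSTANT border mode substitutes constant_border_value for any corner that falls outside the source, while REPLICATE clamps the coordinates instead. A sketch of the standard blend those helpers compute (exact rounding of the final cast differs per data type):

// Illustrative: the conventional bilinear weighting of the four corner samples.
inline float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left
    const float w01 = dx * (1.f - dy);         // top-right
    const float w10 = (1.f - dx) * dy;         // bottom-left
    const float w11 = dx * dy;                 // bottom-right
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}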
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 8; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} - -void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/list.h b/src/core/cpu/kernels/scale/neon/list.h deleted file mode 100644 index c91242f5b2..0000000000 --- a/src/core/cpu/kernels/scale/neon/list.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
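Both bilinear variants just removed (u8 and s16) differ only in how out-of-range neighbours are produced: BorderMode::CONSTANT substitutes the border value for any of the four taps that falls outside the image, while BorderMode::REPLICATE clamps the coordinates into range first. A hedged, self-contained sketch of that per-pixel step follows; delta_blend is a plain weighted blend and is only assumed to match what scale_helpers::delta_bilinear computes, and the names and signature are illustrative rather than ACL API:

    #include <algorithm>

    // Standard bilinear blend of the four neighbouring taps.
    template <typename T>
    float delta_blend(T a00, T a01, T a10, T a11, float dx, float dy)
    {
        return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy)
             + a10 * (1.f - dx) * dy + a11 * dx * dy;
    }

    // One output sample with the two border modes used by the deleted kernels.
    // Strides are in elements; (x0, y0) is the top-left tap of the 2x2 neighbourhood.
    template <typename T>
    T sample_bilinear(const T *plane, int x0, int y0, int w, int h,
                      int stride_x, int stride_y, float dx, float dy,
                      bool replicate, T border_value)
    {
        auto tap = [&](int x, int y) -> T
        {
            if (replicate)
            {
                // BorderMode::REPLICATE: clamp the coordinate into the valid range.
                x = std::min(std::max(x, 0), w - 1);
                y = std::min(std::max(y, 0), h - 1);
                return plane[x * stride_x + y * stride_y];
            }
            // BorderMode::CONSTANT: out-of-range taps read the constant border value.
            return (x >= 0 && x < w && y >= 0 && y < h) ? plane[x * stride_x + y * stride_y]
                                                        : border_value;
        };
        return static_cast<T>(delta_blend(tap(x0, y0), tap(x0 + 1, y0),
                                          tap(x0, y0 + 1), tap(x0 + 1, y0 + 1), dx, dy));
    }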
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H -#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) - -DECLARE_SCALE_KERNEL(qasymm8_neon_scale); -DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); - -#undef DECLARE_SCALE_KERNEL - -template -void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, - bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / sizeof(T); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const T *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -template -void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const T *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template -void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */ diff --git a/src/core/cpu/kernels/scale/neon/qasymm8.cpp b/src/core/cpu/kernels/scale/neon/qasymm8.cpp deleted file mode 100644 index 90302ce889..0000000000 --- a/src/core/cpu/kernels/scale/neon/qasymm8.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
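Every kernel in these files derives its vertical source index from scale_utils::calculate_resize_ratio plus a sampling offset. A small sketch of the assumed arithmetic (the align_corners branch maps the first and last samples of input and output onto each other; the nearest-neighbour path rounds half away from zero when align_corners is set, while the bilinear path re-subtracts the sampling offset before flooring, as in the code above):

    #include <cmath>
    #include <cstddef>

    // Assumed behaviour of scale_utils::calculate_resize_ratio (a sketch, not the ACL source).
    inline float resize_ratio(std::size_t in, std::size_t out, bool align_corners)
    {
        return align_corners ? float(in - 1) / float(out - 1) : float(in) / float(out);
    }

    // Vertical source row used by the nearest-neighbour kernels.
    inline int nearest_row(int out_y, float hr, float sampling_offset, bool align_corners)
    {
        const float in_y = (out_y + sampling_offset) * hr;
        return align_corners ? int(std::lround(in_y))  // round half away from zero
                             : int(std::floor(in_y));
    }

    // Vertical source row used by the bilinear kernels (top tap of the 2x2 neighbourhood).
    inline int bilinear_row(int out_y, float hr, float sampling_offset)
    {
        return int(std::floor((out_y + sampling_offset) * hr - sampling_offset));
    }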
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/scale/neon/list.h" - -namespace arm_compute -{ -namespace -{ -void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(1, Window::Dimension(0, 0, 0)); - win_in.set(2, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(1); - const int32_t in_dim_h = src->info()->dimension(2); - const int32_t stride_w = src->info()->strides_in_bytes()[1]; - const int32_t stride_h = src->info()->strides_in_bytes()[2]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp deleted file mode 100644 index 07d6c6ef03..0000000000 --- a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
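The QASYMM8 kernel above (and the QASYMM8_SIGNED one below) wraps the same bilinear blend in a dequantize/requantize pair driven by the input and output UniformQuantizationInfo. A simplified sketch, assuming an affine scheme real = (q - offset) * scale and plain round-to-nearest requantization; ACL's exact rounding policy and Qasymm8QuantizationHelper API are not reproduced here, and the signed variant is identical apart from int8_t storage and a [-128, 127] clamp:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QuantInfo { float scale; int32_t offset; };   // stand-in for UniformQuantizationInfo

    inline float dequantize(uint8_t v, QuantInfo qi) { return (int32_t(v) - qi.offset) * qi.scale; }

    inline uint8_t quantize(float v, QuantInfo qi)
    {
        const int32_t q = int32_t(std::lround(v / qi.scale)) + qi.offset;
        return uint8_t(std::min(std::max(q, 0), 255));
    }

    // Dequantize the four taps, blend in float, requantize with the output parameters.
    inline uint8_t bilinear_q8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                               float dx, float dy, QuantInfo in_qi, QuantInfo out_qi)
    {
        const float f00 = dequantize(a00, in_qi), f01 = dequantize(a01, in_qi);
        const float f10 = dequantize(a10, in_qi), f11 = dequantize(a11, in_qi);
        const float blended = f00 * (1 - dx) * (1 - dy) + f01 * dx * (1 - dy)
                            + f10 * (1 - dx) * dy + f11 * dx * dy;
        return quantize(blended, out_qi);
    }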
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/scale/neon/list.h" - -namespace arm_compute -{ -namespace -{ -void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(1, Window::Dimension(0, 0, 0)); - win_in.set(2, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(1); - const int32_t in_dim_h = src->info()->dimension(2); - const int32_t stride_w = src->info()->strides_in_bytes()[1]; - const int32_t stride_h = src->info()->strides_in_bytes()[2]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? 
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff 
--git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp deleted file mode 100644 index 76e7735b8a..0000000000 --- a/src/core/cpu/kernels/scale/sve/fp16.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - // Store results - svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} - -void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - using ConstType = typename std::conditional::value, half, float16_t>::type; - - const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp deleted file mode 100644 index 030e109cdf..0000000000 --- a/src/core/cpu/kernels/scale/sve/fp32.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
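The SVE nearest-neighbour kernels replace the NEON main-loop-plus-tail shape with a single predicated loop: svwhilelt builds a predicate covering only the elements that remain, so the final partial vector needs no scalar cleanup. A minimal sketch for 32-bit data, assuming an SVE-enabled toolchain (e.g. -march=armv8.2-a+sve); 16-bit and 8-bit variants would pair svwhilelt_b16 with svcnth and svwhilelt_b8 with svcntb so the step matches the predicate's lane width:

    #include <arm_sve.h>
    #include <cstdint>

    // Predicated row copy: the predicate masks off lanes past the end of the row.
    void copy_row_sve_f32(const float *src, float *dst, int64_t n)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            svst1_f32(pg, dst + x, svld1_f32(pg, src + x));
            x += svcntw();                 // advance by the number of 32-bit lanes
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }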
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include - -#include - -namespace arm_compute -{ -namespace -{ -void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - // Store results - svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - out); -} - -void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const float const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp deleted file mode 100644 index 486c674612..0000000000 --- a/src/core/cpu/kernels/scale/sve/integer.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
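All of these kernels address the source tensor through element strides that include padding: the channel stride is the padded channel count, the row stride multiplies it by the padded width, and a pixel is reached as c + x * stride_c + y * stride_wc. A small sketch of that addressing with strides expressed in elements; the real code takes the sizes and padding from ITensorInfo, and the struct here is purely illustrative:

    #include <cstddef>

    struct PaddedNHWC
    {
        std::size_t channels, width;        // logical sizes (dimension 0 and 1)
        std::size_t pad_left, pad_right;    // padding around the channel dimension
        std::size_t pad_top, pad_bottom;    // padding around the width dimension

        std::size_t stride_c()  const { return channels + pad_left + pad_right; }
        std::size_t stride_wc() const { return stride_c() * (width + pad_top + pad_bottom); }

        // Element index of channel c at spatial position (x, y) within one batch plane.
        std::size_t at(std::size_t c, std::size_t x, std::size_t y) const
        {
            return c + x * stride_c() + y * stride_wc();
        }
    };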
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void u8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - // Store results - svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} - -void s16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - u8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} - -void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - s16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/list.h b/src/core/cpu/kernels/scale/sve/list.h deleted file mode 100644 index b9c3a10a78..0000000000 --- a/src/core/cpu/kernels/scale/sve/list.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
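For reference, every bilinear branch in these deleted scale kernels gathers the four neighbouring samples a00, a01, a10 and a11 and blends them with the precomputed fractional offsets dx and dy through scale_helpers::delta_bilinear, casting the result back to the destination element type. A minimal scalar sketch of that blend, assuming the helper implements the standard bilinear weighting (the function below is illustrative, not the library implementation):

// Standard bilinear blend of four neighbours with fractional offsets dx, dy in [0, 1].
// a00 = top-left, a01 = top-right, a10 = bottom-left, a11 = bottom-right sample.
static inline float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // weight of the top-left sample
    const float w01 = dx * (1.f - dy);         // weight of the top-right sample
    const float w10 = (1.f - dx) * dy;         // weight of the bottom-left sample
    const float w11 = dx * dy;                 // weight of the bottom-right sample
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}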
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H -#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) - -DECLARE_SCALE_KERNEL(fp16_sve_scale); -DECLARE_SCALE_KERNEL(fp32_sve_scale); -DECLARE_SCALE_KERNEL(s16_sve_scale); -DECLARE_SCALE_KERNEL(u8_sve_scale); -DECLARE_SCALE_KERNEL(qasymm8_sve_scale); -DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale); - -#undef DECLARE_SCALE_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */ diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp deleted file mode 100644 index c9122ad40b..0000000000 --- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
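The *_sve_scale_nearest variants declared above all map an output coordinate to a source row with the same arithmetic: a resize ratio between the two extents, the sampling offset, and either flooring or half-away-from-zero rounding depending on align_corners. A small sketch of that index computation, assuming scale_utils::calculate_resize_ratio follows the usual align-corners convention (the helper names below are illustrative, not the library API):

#include <cmath>

// Ratio between source and destination extent along one axis. With align_corners the
// first and last samples of both tensors are mapped onto each other.
static inline float resize_ratio(int in_size, int out_size, bool align_corners)
{
    const int offset = (align_corners && out_size > 1) ? 1 : 0;
    return static_cast<float>(in_size - offset) / static_cast<float>(out_size - offset);
}

// Source index chosen by a nearest-neighbour kernel for output index z.
static inline int nearest_index(int z, float ratio, float sampling_offset, bool align_corners)
{
    const float pos = (static_cast<float>(z) + sampling_offset) * ratio;
    return align_corners ? static_cast<int>(std::lround(pos)) // std::lround rounds half away from zero
                         : static_cast<int>(std::floor(pos));
}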
- */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - 
Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // defined(ARM_COMPUTE_ENABLE_SVE) \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp deleted file mode 100644 index 0843e61fd4..0000000000 --- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
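The qasymm8 path differs from the u8/s16 kernels only in that the four neighbours are dequantized with the source UniformQuantizationInfo before the blend and the interpolated value is requantized with the destination scale and offset. A compact sketch of that round trip using the usual asymmetric-quantization formulas (the helpers below are illustrative stand-ins for Qasymm8QuantizationHelper, not the library code):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo { float scale; int32_t offset; }; // uniform asymmetric quantization parameters

// real = scale * (quantized - offset)
static inline float dequantize_u8(uint8_t q, QuantInfo qi)
{
    return qi.scale * (static_cast<int32_t>(q) - qi.offset);
}

// quantized = clamp(round(real / scale) + offset, 0, 255)
static inline uint8_t quantize_u8(float v, QuantInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Dequantize the four neighbours, blend in float, requantize with the output parameters.
static uint8_t bilinear_qasymm8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                float dx, float dy, QuantInfo in_qi, QuantInfo out_qi)
{
    const float f00 = dequantize_u8(a00, in_qi);
    const float f01 = dequantize_u8(a01, in_qi);
    const float f10 = dequantize_u8(a10, in_qi);
    const float f11 = dequantize_u8(a11, in_qi);
    const float blended = f00 * (1.f - dx) * (1.f - dy) + f01 * dx * (1.f - dy)
                        + f10 * (1.f - dx) * dy + f11 * dx * dy;
    return quantize_u8(blended, out_qi);
}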
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_signed_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h deleted file mode 100644 index 5ebee31272..0000000000 --- a/src/core/cpu/kernels/softmax/impl/neon/list.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H - -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; - } - - *out_ptr = max_val; - }, - input, output); -} - -template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - static_assert(std::is_same::value - || std::is_same::value, - "quantized type should be either qasymm8_t or qasymm8_signed_t."); - - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - constexpr int vec_size = 16; - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); - - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); - - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else - { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); - } - - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } - - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) - { - element = (max_val - in_ptr[x]) * 
scale_beta; - sum += std::exp(element); - } - else - { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; - } - - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else - { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) - { - const float32x4x4_t sub = - { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); - } - else - { - float32x4x4_t mul = - { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) - { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); - } - - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); - } - } - } - }, - in_it, max_it, out_it); -} - -template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } - - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) - { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; - - if(is_log) - { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); - } - else - { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; - } - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) - { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); - } - else - { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = tmp_ptr[x] - sum; - } - else - { - out_ptr[x] = tmp_ptr[x] * sum_inversed; - } - } - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp deleted file mode 100644 index 7a577fd565..0000000000 --- a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
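Both the NEON and the SVE float softmax kernels implement the same numerically stable three-pass scheme: subtract the per-row maximum produced by the logits_1d_max kernel, exponentiate and accumulate the sum (kept in log space when is_log is set), then normalise by the reciprocal of the sum. A plain scalar reference of the non-log path that the vector code can be checked against (illustrative only, not library code):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Numerically stable softmax over one row:
//   y[i] = exp(beta * (x[i] - max(x))) / sum_j exp(beta * (x[j] - max(x)))
std::vector<float> softmax_row(const std::vector<float> &x, float beta)
{
    float max_val = std::numeric_limits<float>::lowest();
    for (float v : x)
    {
        max_val = std::max(max_val, v); // pass 1: row maximum
    }

    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp((x[i] - max_val) * beta); // pass 2: shifted exponentials
        sum += y[i];
    }

    const float inv_sum = 1.f / sum;
    for (float &v : y)
    {
        v *= inv_sum; // pass 3: normalisation
    }
    return y;
}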
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - - auto max_val = svmaxv(all_true_pg, vec_max); - - *out_ptr = max_val; - }, - input, output); -} - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - const auto all_true_pg = wrapper::svptrue(); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum = 
wrapper::svdup_n(static_cast(0)); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_elements, vec_max); - if(is_log) - { - vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); - } - else - { - vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); - vec_sum = svadd_m(pg, vec_sum, vec_elements); - } - svst1(pg, tmp_ptr + x, vec_elements); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else - { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} - -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h deleted file mode 100644 index b4e1e1b186..0000000000 --- a/src/core/cpu/kernels/softmax/impl/sve/list.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
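All of the SVE kernels in this change share one loop idiom: build a predicate with svwhilelt for the elements that remain, run one predicated vector iteration, advance by the hardware vector length, and stop once svptest_any reports an all-false predicate, so no scalar tail loop is needed. A minimal standalone example of that pattern (requires an SVE-capable toolchain and target, e.g. -march=armv8-a+sve; not library code):

#include <arm_sve.h>
#include <cstdint>

// Multiply src by a scalar factor into dst using predication instead of a scalar tail loop.
void scale_array_sve(const float *src, float *dst, int32_t n, float factor)
{
    int32_t x = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes [x, n) are active
    do
    {
        const svfloat32_t v = svld1_f32(pg, src + x);         // predicated load
        svst1_f32(pg, dst + x, svmul_n_f32_z(pg, v, factor)); // predicated multiply and store
        x += static_cast<int32_t>(svcntw());                  // advance by the vector length in 32-bit lanes
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(svptrue_b32(), pg));                 // continue while any lane is still active
}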
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -template -void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = svdup_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - const auto all_true_pg = wrapper::svptrue(); - using SVEType = typename wrapper::traits::sve_vector::type; - - const int inc_1 = static_cast(svcntw()); - const int inc_2 = static_cast(2 * svcntw()); - const int inc_3 = static_cast(3 * svcntw()); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_max, vec_elements); - - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); - - if(is_log) - { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, 
vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); - } - else - { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); - } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else - { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) - { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - - if(is_qasymm8_signed) - { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, vec_in_0, offset_vec); - res_1 = svsub_z(pg_1, vec_in_1, offset_vec); - res_2 = svsub_z(pg_2, vec_in_2, offset_vec); - res_3 = svsub_z(pg_3, vec_in_3, offset_vec); - } - } - - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} -#endif /* 
defined(ARM_COMPUTE_ENABLE_SVE2) */ -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ - -#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h deleted file mode 100644 index ac1346001a..0000000000 --- a/src/core/cpu/kernels/sub/neon/list.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H -#define SRC_CORE_NEON_KERNELS_SUB_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SUB_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_SUB_KERNEL(sub_qasymm8_neon); -DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); -DECLARE_SUB_KERNEL(sub_qsymm16_neon); - -#undef DECLARE_SUB_KERNEL - -template -void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - bool is_sat = policy == ConvertPolicy::SATURATE; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? 
input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) - { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); - } - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) - { - res = static_cast(-1) * res; - } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H diff --git a/src/core/cpu/kernels/sub/neon/qasymm8.cpp b/src/core/cpu/kernels/sub/neon/qasymm8.cpp deleted file mode 100644 index 8f4cd8bdbb..0000000000 --- a/src/core/cpu/kernels/sub/neon/qasymm8.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? 
vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - const auto broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(static_cast<uint8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - } - }; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ?
vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - 
vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - - *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} - -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp deleted file mode 100644 index 2c9e411743..0000000000 --- a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? 
vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - } - }; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ?
vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), 
-#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - - *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qsymm16.cpp b/src/core/cpu/kernels/sub/neon/qsymm16.cpp deleted file mode 100644 index 4dfdc0e78c..0000000000 --- a/src/core/cpu/kernels/sub/neon/qsymm16.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; - - const int32x4x2_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/ClCompileContext.h b/src/core/gpu/cl/ClCompileContext.h deleted file mode 100644 index e69cc0200f..0000000000 --- a/src/core/gpu/cl/ClCompileContext.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H -#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H - -#include "arm_compute/core/CL/CLCompileContext.h" - -namespace arm_compute -{ -namespace opencl -{ -using ClCompileContext = arm_compute::CLCompileContext; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */ diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp deleted file mode 100644 index 4a9ba874b1..0000000000 --- a/src/core/gpu/cl/ClKernelLibrary.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/ClKernelLibrary.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Utils.h" - -#include -#include -#include -#include - -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS -#include - -namespace -{ -/* Decoding table */ -constexpr std::array b64_invtab = -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, - 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/** Decode a base64 encoded string - * - * @param[in] str Base64 encoded string to decode - * - * @return The decode string in case of a valid, non-empty string otherwise an empty string - */ -std::string decode_base64(const std::string &str) -{ - constexpr const char pad_char = '='; - - // Handle empty string - if(str.empty()) - { - return {}; - } - - // Base64 encoded string has size multiple of 4 - if(str.length() % 4) - { - return {}; - } - - // - // Check encoded string padding - std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char); - const int str_len = str.size(); - - // Reserve memory for the decoded string - // 
Note each 4 consecutive elements of 6-bit encode 3 bytes - std::string dec_b64; - dec_b64.reserve(((str_len / 4) * 3)); - - // Block decoding function (exclude padding) - int c = 0; - const int end = str_len - 4 - padding; - for(; c <= end; c += 4) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - const int byte2 = b64_invtab[str[c + 2]]; - const int byte3 = b64_invtab[str[c + 3]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); - dec_b64.push_back((byte2 << 6) | (byte3)); - } - - // Last step that might contain padding symbols - if(padding == 1) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - const int byte2 = b64_invtab[str[c + 2]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); - } - else if(padding == 2) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - } - - return dec_b64; -} - -/** Decompress a zlib compressed string - * - * @param[in] str ZLib compressed string - * - * @return The decompressed string if successful, otherwise false. - */ -std::string decompress_zlib(const std::string &str) -{ - // Create and initialize decompression stream - z_stream ds{}; - if(inflateInit(&ds) != Z_OK) - { - return std::string(); - } - ds.avail_in = str.size(); - ds.next_in = (Bytef *)str.data(); - - // Roll-over the string using a buffer and decompress - int status = Z_OK; - char roll_buff[16384]; - std::string inflated_str; - do - { - ds.avail_out = sizeof(roll_buff); - ds.next_out = reinterpret_cast(roll_buff); - - status = inflate(&ds, 0); - if(inflated_str.size() < ds.total_out) - { - inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); - } - } - while(status == Z_OK); - - // Finalize decompression stream - inflateEnd(&ds); - if(status != Z_STREAM_END) - { - return std::string(); - } - - return inflated_str; -} -} // namespace -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - -namespace arm_compute -{ -namespace opencl -{ -const std::map ClKernelLibrary::_kernel_program_map = -{ - // Common Kernels - { "activation_layer", "common/activation_layer.cl" }, - { "activation_layer_quant", "common/activation_layer_quant.cl" }, - { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, - { "arg_min_max_x", "common/arg_min_max.cl" }, - { "arg_min_max_y", "common/arg_min_max.cl" }, - { "arg_min_max_z", "common/arg_min_max.cl" }, - { "arg_min_max_w", "common/arg_min_max.cl" }, - { "bitwise_or", "common/bitwise_op.cl" }, - { "bitwise_and", "common/bitwise_op.cl" }, - { "bitwise_xor", "common/bitwise_op.cl" }, - { "bitwise_not", "common/bitwise_op.cl" }, - { "bounding_box_transform", "common/bounding_box_transform.cl" }, - { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, - { "compare_equal", "common/comparisons.cl" }, - { "compare_equal_quantized", "common/comparisons.cl" }, - { "compare_notequal", "common/comparisons.cl" }, - { "compare_notequal_quantized", "common/comparisons.cl" }, - { "compare_greater", "common/comparisons.cl" }, - { "compare_greater_quantized", "common/comparisons.cl" }, - { "compare_greaterequal", "common/comparisons.cl" }, - { "compare_greaterequal_quantized", "common/comparisons.cl" }, - { "compare_less", "common/comparisons.cl" }, - { "compare_less_quantized", "common/comparisons.cl" }, - { "compare_lessequal", "common/comparisons.cl" }, - { 
"compare_lessequal_quantized", "common/comparisons.cl" }, - { "concatenate", "common/concatenate.cl" }, - { "concatenate_width", "common/concatenate.cl" }, - { "concatenate_height", "common/concatenate.cl" }, - { "concatenate_width_x2", "common/concatenate.cl" }, - { "concatenate_width_x4", "common/concatenate.cl" }, - { "col2im", "common/col2im.cl" }, - { "cast_down", "common/cast.cl" }, - { "cast_up", "common/cast.cl" }, - { "convert_fc_weights", "common/convert_fc_weights.cl" }, - { "copy_tensor", "common/copy_tensor.cl" }, - { "crop_tensor", "common/crop_tensor.cl" }, - { "deconvolution_reshape", "common/deconvolution_layer.cl" }, - { "deconvolution_upsample", "common/deconvolution_layer.cl" }, - { "dequantization_layer", "common/dequantization_layer.cl" }, - { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, - { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, - { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, - { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, - { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, - { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, - { "elementwise_operation_POWER", "common/elementwise_operation.cl" }, - { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, - { "elementwise_operation_AND", "common/elementwise_operation.cl" }, - { "elementwise_operation_OR", "common/elementwise_operation.cl" }, - { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { 
"fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" }, - { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { "l2_normalize_z", "common/l2_normalize.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { 
"minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "pooling_layer_2", "common/pooling_layer.cl" }, - { "pooling_layer_3", "common/pooling_layer.cl" }, - { "pooling_layer_optimized_3", "common/pooling_layer.cl" }, - { "pooling_layer_7", "common/pooling_layer.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { "reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, -#ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" }, - { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" }, - { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" }, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { 
"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "remap_nearest_neighbour_nchw", "nchw/remap.cl" }, - { "remap_bilinear_nchw", "nchw/remap.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { 
"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, -#endif /* ENABLE_NCHW_KERNELS */ -#ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" }, - { "remap_bilinear_nhwc", "nhwc/remap.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", 
"nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, -#endif /* ENABLE_NHWC_KERNELS */ -}; - -const std::map ClKernelLibrary::_program_source_map = -{ -#ifdef EMBEDDED_KERNELS - { - "common/activation_layer.cl", -#include "./cl_kernels/common/activation_layer.clembed" - }, - { - "common/activation_layer_quant.cl", -#include "./cl_kernels/common/activation_layer_quant.clembed" - }, - { - "common/arg_min_max.cl", -#include "./cl_kernels/common/arg_min_max.clembed" - }, - { - "common/bitwise_op.cl", -#include "./cl_kernels/common/bitwise_op.clembed" - }, - { - "common/bounding_box_transform.cl", -#include "./cl_kernels/common/bounding_box_transform.clembed" - }, - { - "common/bounding_box_transform_quantized.cl", -#include "./cl_kernels/common/bounding_box_transform_quantized.clembed" - }, - { - "common/col2im.cl", -#include "./cl_kernels/common/col2im.clembed" - }, - { - "common/comparisons.cl", -#include "./cl_kernels/common/comparisons.clembed" - }, - { - "common/concatenate.cl", -#include "./cl_kernels/common/concatenate.clembed" - }, - { - "common/convert_fc_weights.cl", -#include "./cl_kernels/common/convert_fc_weights.clembed" - }, - { - "common/convolution_layer.cl", -#include "./cl_kernels/common/convolution_layer.clembed" - }, - { - "common/copy_tensor.cl", -#include "./cl_kernels/common/copy_tensor.clembed" - }, - { - "common/crop_tensor.cl", -#include "./cl_kernels/common/crop_tensor.clembed" - }, - { - "common/deconvolution_layer.cl", -#include "./cl_kernels/common/deconvolution_layer.clembed" - }, - { - "common/cast.cl", -#include "./cl_kernels/common/cast.clembed" - }, - { - "common/dequantization_layer.cl", -#include "./cl_kernels/common/dequantization_layer.clembed" - }, - { - "common/elementwise_operation.cl", -#include "./cl_kernels/common/elementwise_operation.clembed" - }, - { - "common/elementwise_operation_quantized.cl", -#include "./cl_kernels/common/elementwise_operation_quantized.clembed" - }, - { - "common/elementwise_unary.cl", -#include "./cl_kernels/common/elementwise_unary.clembed" - }, - { - "common/fft.cl", -#include "./cl_kernels/common/fft.clembed" - }, - { - "common/fft_digit_reverse.cl", -#include "./cl_kernels/common/fft_digit_reverse.clembed" - }, - { - "common/fft_scale.cl", -#include "./cl_kernels/common/fft_scale.clembed" - }, - { - "common/fill_border.cl", -#include "./cl_kernels/common/fill_border.clembed" - }, - { - "common/floor.cl", -#include "./cl_kernels/common/floor.clembed" - }, - { - "common/gather.cl", -#include "./cl_kernels/common/gather.clembed" - }, - { - "common/gemm.cl", -#include "./cl_kernels/common/gemm.clembed" - }, - { - "common/gemm_v1.cl", -#include "./cl_kernels/common/gemm_v1.clembed" - }, - { - "common/gemmlowp.cl", -#include "./cl_kernels/common/gemmlowp.clembed" - }, - { - 
"common/gemv.cl", -#include "./cl_kernels/common/gemv.clembed" - }, - { - "common/generate_proposals.cl", -#include "./cl_kernels/common/generate_proposals.clembed" - }, - { - "common/generate_proposals_quantized.cl", -#include "./cl_kernels/common/generate_proposals_quantized.clembed" - }, - { - "helpers.h", -#include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", -#include "./cl_kernels/helpers_asymm.hembed" - }, - { - "common/instance_normalization.cl", -#include "./cl_kernels/common/instance_normalization.clembed" - }, - { - "common/l2_normalize.cl", -#include "./cl_kernels/common/l2_normalize.clembed" - }, - { - "common/mean_stddev_normalization.cl", -#include "./cl_kernels/common/mean_stddev_normalization.clembed" - }, - { - "common/memset.cl", -#include "./cl_kernels/common/memset.clembed" - }, - { - "common/minmax_layer.cl", -#include "./cl_kernels/common/minmax_layer.clembed" - }, - { - "common/nonmax.cl", -#include "./cl_kernels/common/nonmax.clembed" - }, - { - "common/batchnormalization_layer.cl", -#include "./cl_kernels/common/batchnormalization_layer.clembed" - }, - { - "common/pad_layer.cl", -#include "./cl_kernels/common/pad_layer.clembed" - }, - { - "common/permute.cl", -#include "./cl_kernels/common/permute.clembed" - }, - { - "common/pixelwise_mul_float.cl", -#include "./cl_kernels/common/pixelwise_mul_float.clembed" - }, - { - "common/pixelwise_mul_int.cl", -#include "./cl_kernels/common/pixelwise_mul_int.clembed" - }, - { - "common/pooling_layer.cl", -#include "./cl_kernels/common/pooling_layer.clembed" - }, - { - "common/qlstm_layer_normalization.cl", -#include "./cl_kernels/common/qlstm_layer_normalization.clembed" - }, - { - "common/quantization_layer.cl", -#include "./cl_kernels/common/quantization_layer.clembed" - }, - { - "common/range.cl", -#include "./cl_kernels/common/range.clembed" - }, - { - "common/reduction_operation.cl", -#include "./cl_kernels/common/reduction_operation.clembed" - }, - { - "common/reshape_layer.cl", -#include "./cl_kernels/common/reshape_layer.clembed" - }, - { - "common/reverse.cl", -#include "./cl_kernels/common/reverse.clembed" - }, - { - "common/roi_align_layer.cl", -#include "./cl_kernels/common/roi_align_layer.clembed" - }, - { - "common/roi_align_layer_quantized.cl", -#include "./cl_kernels/common/roi_align_layer_quantized.clembed" - }, - { - "common/roi_pooling_layer.cl", -#include "./cl_kernels/common/roi_pooling_layer.clembed" - }, - { - "common/select.cl", -#include "./cl_kernels/common/select.clembed" - }, - { - "common/softmax_layer.cl", -#include "./cl_kernels/common/softmax_layer.clembed" - }, - { - "common/softmax_layer_quantized.cl", -#include "./cl_kernels/common/softmax_layer_quantized.clembed" - }, - { - "common/slice_ops.cl", -#include "./cl_kernels/common/slice_ops.clembed" - }, - { - "common/stack_layer.cl", -#include "./cl_kernels/common/stack_layer.clembed" - }, - { - "common/tile.cl", -#include "./cl_kernels/common/tile.clembed" - }, - { - "common/transpose.cl", -#include "./cl_kernels/common/transpose.clembed" - }, - { - "types.h", -#include "./cl_kernels/types.hembed" - }, - { - "common/unpooling_layer.cl", -#include "./cl_kernels/common/unpooling_layer.clembed" - }, -#ifdef ENABLE_NCHW_KERNELS - { - "nchw/batch_to_space.cl", -#include "./cl_kernels/nchw/batch_to_space.clembed" - }, - { - "nchw/channel_shuffle.cl", -#include "./cl_kernels/nchw/channel_shuffle.clembed" - }, - { - "nchw/upsample_layer.cl", -#include "./cl_kernels/nchw/upsample_layer.clembed" - }, - { - "nchw/depth_to_space.cl", 
-#include "./cl_kernels/nchw/depth_to_space.clembed" - }, - { - "nchw/dequantization_layer.cl", -#include "./cl_kernels/nchw/dequantization_layer.clembed" - }, - { - "nchw/direct_convolution1x1.cl", -#include "./cl_kernels/nchw/direct_convolution1x1.clembed" - }, - { - "nchw/direct_convolution3x3.cl", -#include "./cl_kernels/nchw/direct_convolution3x3.clembed" - }, - { - "nchw/direct_convolution5x5.cl", -#include "./cl_kernels/nchw/direct_convolution5x5.clembed" - }, - { - "nchw/direct_convolution_quantized.cl", -#include "./cl_kernels/nchw/direct_convolution_quantized.clembed" - }, - { - "nchw/im2col.cl", -#include "./cl_kernels/nchw/im2col.clembed" - }, - { - "nchw/normalization_layer.cl", -#include "./cl_kernels/nchw/normalization_layer.clembed" - }, - { - "nchw/normalize_planar_yuv_layer.cl", -#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed" - }, - { - "nchw/normalize_planar_yuv_layer_quantized.cl", -#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed" - }, - { - "nchw/batchnormalization_layer.cl", -#include "./cl_kernels/nchw/batchnormalization_layer.clembed" - }, - { - "nchw/pooling_layer.cl", -#include "./cl_kernels/nchw/pooling_layer.clembed" - }, - { - "nchw/pooling_layer_quantized.cl", -#include "./cl_kernels/nchw/pooling_layer_quantized.clembed" - }, - { - "nchw/prior_box_layer.cl", -#include "./cl_kernels/nchw/prior_box_layer.clembed" - }, - { - "nchw/remap.cl", -#include "./cl_kernels/nchw/remap.clembed" - }, - { - "nchw/reorg_layer.cl", -#include "./cl_kernels/nchw/reorg_layer.clembed" - }, - { - "nchw/scale.cl", -#include "./cl_kernels/nchw/scale.clembed" - }, - { - "nchw/space_to_batch.cl", -#include "./cl_kernels/nchw/space_to_batch.clembed" - }, - { - "nchw/space_to_depth.cl", -#include "./cl_kernels/nchw/space_to_depth.clembed" - }, - { - "nchw/winograd_filter_transform.cl", -#include "./cl_kernels/nchw/winograd_filter_transform.clembed" - }, - { - "nchw/winograd_input_transform.cl", -#include "./cl_kernels/nchw/winograd_input_transform.clembed" - }, - { - "nchw/winograd_output_transform.cl", -#include "./cl_kernels/nchw/winograd_output_transform.clembed" - }, -#endif /* ENABLE_NCHW_KERNELS */ - -#ifdef ENABLE_NHWC_KERNELS - { - "nhwc/batch_to_space.cl", -#include "./cl_kernels/nhwc/batch_to_space.clembed" - }, - { - "nhwc/channel_shuffle.cl", -#include "./cl_kernels/nhwc/channel_shuffle.clembed" - }, - { - "nhwc/upsample_layer.cl", -#include "./cl_kernels/nhwc/upsample_layer.clembed" - }, - { - "nhwc/depth_to_space.cl", -#include "./cl_kernels/nhwc/depth_to_space.clembed" - }, - { - "nhwc/dequantization_layer.cl", -#include "./cl_kernels/nhwc/dequantization_layer.clembed" - }, - { - "nhwc/direct_convolution.cl", -#include "./cl_kernels/nhwc/direct_convolution.clembed" - }, - { - "nhwc/dwc_native_fp_nhwc.cl", -#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed" - }, - { - "nhwc/dwc_native_quantized_nhwc.cl", -#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed" - }, - { - "nhwc/normalization_layer.cl", -#include "./cl_kernels/nhwc/normalization_layer.clembed" - }, - { - "nhwc/normalize_planar_yuv_layer.cl", -#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed" - }, - { - "nhwc/normalize_planar_yuv_layer_quantized.cl", -#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed" - }, - { - "nhwc/im2col.cl", -#include "./cl_kernels/nhwc/im2col.clembed" - }, - { - "nhwc/batchnormalization_layer.cl", -#include "./cl_kernels/nhwc/batchnormalization_layer.clembed" - }, - { - "nhwc/pooling_layer.cl", 
-#include "./cl_kernels/nhwc/pooling_layer.clembed" - }, - { - "nhwc/pooling_layer_quantized.cl", -#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed" - }, - { - "nhwc/remap.cl", -#include "./cl_kernels/nhwc/remap.clembed" - }, - { - "nhwc/reorg_layer.cl", -#include "./cl_kernels/nhwc/reorg_layer.clembed" - }, - { - "nhwc/scale.cl", -#include "./cl_kernels/nhwc/scale.clembed" - }, - { - "nhwc/space_to_batch.cl", -#include "./cl_kernels/nhwc/space_to_batch.clembed" - }, - { - "nhwc/space_to_depth.cl", -#include "./cl_kernels/nhwc/space_to_depth.clembed" - }, - { - "nhwc/winograd_filter_transform.cl", -#include "./cl_kernels/nhwc/winograd_filter_transform.clembed" - }, - { - "nhwc/winograd_input_transform.cl", -#include "./cl_kernels/nhwc/winograd_input_transform.clembed" - }, - { - "nhwc/winograd_output_transform.cl", -#include "./cl_kernels/nhwc/winograd_output_transform.clembed" - }, -#endif /* ENABLE_NHWC_KERNELS */ -#endif /* EMBEDDED_KERNELS */ -}; - -ClKernelLibrary &ClKernelLibrary::get() -{ - static ClKernelLibrary _kernel_library; - return _kernel_library; -} - -std::string ClKernelLibrary::program_name(const std::string &kernel_name) const -{ - // Find which program contains the kernel - auto kernel_program_it = _kernel_program_map.find(kernel_name); - - if(_kernel_program_map.end() == kernel_program_it) - { - ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); - } - - const std::string program_name = kernel_program_it->second; - - return program_name; -} - -void ClKernelLibrary::set_kernel_path(std::string kernel_path) -{ - _kernel_path = std::move(kernel_path); - _kernel_path += "/"; -} - -const std::string &ClKernelLibrary::kernel_path() const -{ - return _kernel_path; -} - -ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const -{ -#ifdef EMBEDDED_KERNELS -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS - const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) - { - return ClProgramInfo{ inflatted_program_source_it->second, false }; - } -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - - const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) - { - ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); - } - std::string program_source = program_source_it->second; - -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS - std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second)); - ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program"); - _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source)); - program_source = std::move(decompressed_program_source); -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - - return ClProgramInfo{ program_source, false }; -#else /* EMBEDDED_KERNELS */ - // Check for binary - std::string source_name = _kernel_path + program_name; - std::string binary_name = source_name + "bin"; - std::string program_source{}; - bool is_binary = false; - - if(std::ifstream(binary_name).is_open()) - { - program_source = read_file(binary_name, true); - is_binary = true; - } - else if(std::ifstream(source_name).is_open()) - { - program_source = read_file(source_name, false); - } - else - { - ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); - } - - return ClProgramInfo{ 
program_source, is_binary }; -#endif /* EMBEDDED_KERNELS */ -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/ClKernelLibrary.h b/src/core/gpu/cl/ClKernelLibrary.h deleted file mode 100644 index 42bec95032..0000000000 --- a/src/core/gpu/cl/ClKernelLibrary.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_KERNEL_LIBRARY_H -#define ARM_COMPUTE_CL_KERNEL_LIBRARY_H - -#include -#include -#include - -namespace arm_compute -{ -namespace opencl -{ -/** ClKernelLibrary contains all the OpenCL kernels that are used throughout the library - * - * @note Kernel library is a singleton to reduce memory requirements - * @note Sole responsibility is just to provide access to the kernel string, - * does not perform any compilation and relevant tasks - */ -class ClKernelLibrary final -{ -private: - /** Default Constructor */ - ClKernelLibrary() = default; - /** Prevent instances of this class from being copied */ - ClKernelLibrary(const ClKernelLibrary &) = delete; - /** Prevent instances of this class from being copied */ - const ClKernelLibrary &operator=(const ClKernelLibrary &) = delete; - -public: - /** Structure to encapsulte program related information */ - struct ClProgramInfo - { - std::string program{}; /**< Program raw string */ - bool is_binary{ false }; /**< Flag that indicates if is in binary format */ - }; - -public: - /** Access the KernelLibrary singleton - * - * @return The KernelLibrary instance - */ - static ClKernelLibrary &get(); - /** Sets the path that the kernels reside in - * - * @param[in] kernel_path Path of the kernel - */ - void set_kernel_path(std::string kernel_path); - /** Gets the path that the kernels reside in - */ - const std::string &kernel_path() const; - /** Gets the source of the selected program - * - * @param[in] program_name Program name - * - * @return A pair with the source (false) or the binary (true), of the selected program - */ - ClProgramInfo program(const std::string &program_name) const; - /** Returns the program name given a kernel name - * - * @return Program name - */ - std::string program_name(const std::string &kernel_name) const; - -private: - std::string _kernel_path{}; /**< Path to the kernels folder. 
*/ - mutable std::map _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ - static const std::map _kernel_program_map; /**< Map that associates kernel names with programs. */ - static const std::map _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_KERNEL_LIBRARY_H */ diff --git a/src/core/gpu/cl/IClKernel.h b/src/core/gpu/cl/IClKernel.h deleted file mode 100644 index 52ea3c9183..0000000000 --- a/src/core/gpu/cl/IClKernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICL_KERNEL_H -#define ARM_COMPUTE_ICL_KERNEL_H - -#include "arm_compute/core/ITensorInfo.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -using IClKernel = arm_compute::ICLKernel; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.cpp b/src/core/gpu/cl/kernels/ClActivationKernel.cpp deleted file mode 100644 index 21c05632f9..0000000000 --- a/src/core/gpu/cl/kernels/ClActivationKernel.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
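As a minimal sketch of how the kernel library deleted above is driven (the method names and ClProgramInfo fields follow the declarations in this header; the kernel path and the "floor_layer" lookup are illustrative, the latter taken from the kernel map earlier in this file):

// Sketch only: resolve the program that provides a given kernel name.
auto &lib = arm_compute::opencl::ClKernelLibrary::get();
lib.set_kernel_path("/opt/acl/cl_kernels");                     // consulted only when kernels are not embedded
const std::string prog_name = lib.program_name("floor_layer");  // -> "common/floor.cl" per the map above
const auto prog_info = lib.program(prog_name);
// prog_info.program holds the OpenCL C source (or a program binary when prog_info.is_binary is true)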
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClActivationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); - - static std::set quantized_supported_activations = - { - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, - }; - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClActivationKernel::ClActivationKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void 
ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - - auto padding_info = get_padding_info({ src, dst }); - - _run_in_place = (dst == nullptr) || (dst == src); - - if(dst != nullptr) - { - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, (dst != nullptr) ? dst : nullptr, act_info)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - const DataType dt = src->data_type(); - float a_const = act_info.a(); - float b_const = act_info.b(); - - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN"); - build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - - std::string kernel_name = std::string("activation_layer"); - - // Set quantization info build options - if(is_quantized) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - - if(!perform_activation_in_float) - { - int a_const_int = 0; - int b_const_int = 0; - - // Create quantized version of constants a, b if needed - switch(dt) - { - case DataType::QASYMM8: - { - a_const_int = quantize_qasymm8(a_const, iq_info); - b_const_int = quantize_qasymm8(b_const, iq_info); - } - break; - case DataType::QASYMM8_SIGNED: - { - a_const_int = quantize_qasymm8_signed(a_const, iq_info); - b_const_int = quantize_qasymm8_signed(b_const, iq_info); - } - break; - case DataType::QSYMM16: - { - a_const_int = quantize_qsymm16(a_const, iq_info); - b_const_int = quantize_qsymm16(b_const, iq_info); - } - break; - default: - break; - } - build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); - build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); - } - else - { - build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); - build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); - - // Set correct kernel name - kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); - - // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) - { - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - if(iq_info != oq_info) - { - build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); - } - } - } - else - { - // Set A, B constants in build options for float types - build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = "activation_layer_"; - _config_id += lower_string(string_from_data_type(dt)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - return Status{}; -} - -void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.h b/src/core/gpu/cl/kernels/ClActivationKernel.h deleted file mode 100644 index 720b16a691..0000000000 --- a/src/core/gpu/cl/kernels/ClActivationKernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the activation kernel. */ -class ClActivationKernel : public IClKernel -{ -public: - ClActivationKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel); - /** Configure kernel for a given list of arguments - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] act_info Activation layer information. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClActivationKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - bool _run_in_place{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp deleted file mode 100644 index fba1b0e087..0000000000 --- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
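The interface removed just above documents that a null dst makes the activation run in place. A hedged configuration sketch (compile_context, src_info and the chosen activation function are illustrative placeholders):

// Sketch only: ReLU applied in place on one tensor, per the in-place contract above.
ClActivationKernel act;
act.configure(compile_context, &src_info, /*dst=*/nullptr,
              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// The same argument combination can be pre-checked with:
// Status st = ClActivationKernel::validate(&src_info, nullptr, act_info);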
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); - - return Status{}; -} -} // namespace - -ClBatchConcatenateKernel::ClBatchConcatenateKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _batch_offset = batch_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1)); - ICLKernel::configure_internal(win); - - // Set config_id 
for enabling LWS tuning - _config_id = "concatenate_"; - _config_id += support::cpp11::to_string(3); - _config_id += "_"; - _config_id += support::cpp11::to_string(batch_offset); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(3)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int batch_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); - return Status{}; -} - -void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace opencl -} // namespace kernels -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h deleted file mode 100644 index 2963d7cdfd..0000000000 --- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the batch concatenate kernel. - * The src tensor will be concatenated into the destination tensor. - */ -class ClBatchConcatenateKernel : public IClKernel -{ -public: - ClBatchConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The dst tensor's low two dimensions can't be smaller than the src one's. - * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClBatchConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _batch_offset{ 0 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCastKernel.cpp b/src/core/gpu/cl/kernels/ClCastKernel.cpp deleted file mode 100644 index fac9ebe5cf..0000000000 --- a/src/core/gpu/cl/kernels/ClCastKernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
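ClBatchConcatenateKernel::run_op() above offsets every write by batch_offset * strides_in_bytes()[3] of the destination, so successive sources can be stacked along axis 3. A tiny sketch of that offset rule (the helper name and array-based strides are illustrative):

#include <cstddef>

// Sketch only: byte offset at which one source starts inside the concatenated dst,
// mirroring offset_to_first_elements_in_bytes in ClBatchConcatenateKernel::run_op().
std::size_t batch_concat_offset_bytes(unsigned int batch_offset, const std::size_t dst_strides_in_bytes[4])
{
    return static_cast<std::size_t>(batch_offset) * dst_strides_in_bytes[3];
}
// Concatenating src_b after src_a along axis 3 would use batch_offset = src_a.dimension(3).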
- */ -#include "src/core/gpu/cl/kernels/ClCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClCastKernel::ClCastKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*dst, src->tensor_shape()); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - - auto padding_info = get_padding_info({ src, dst }); - - // Get data sizes - const size_t src_size = data_size_from_type(src->data_type()); - const size_t dst_size = data_size_from_type(dst->data_type()); - - // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined - build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); - build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); - - // Create kernel - const std::string kernel_name = (src_size >= dst_size) ? 
"cast_down" : "cast_up"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Collapse window - const Window &full_window = window(); - Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); - ICLKernel::configure_internal(collapsed_window); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); -} - -Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); - return Status{}; -} - -void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCastKernel.h b/src/core/gpu/cl/kernels/ClCastKernel.h deleted file mode 100644 index 6bf3cd9e50..0000000000 --- a/src/core/gpu/cl/kernels/ClCastKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H -#define ARM_COMPUTE_CL_CAST_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Casts a given tensor to a new type - * - * @note When casting between quantized types the scale and zeroPoint are ignored - */ -class ClCastKernel : public IClKernel -{ -public: - ClCastKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel); - /** Set the src and dst of the kernel. - * - * Valid conversions src -> dst : - * - * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data) - * - U8 -> S8, U16, S16, U32, S32, F16, F32 - * - U16 -> U8, S8, S16, U32, S32, F16, F32 - * - S16 -> U8, S8, U16, U32, S32, F16, F32 - * - U32 -> U8, S8, U16, S16, S32, F16, F32 - * - S32 -> U8, S8, U16, S16, U32, F16, F32 - * - F16 -> U8, S8, U16, S16, U32, F32 - * - F32 -> U8, S8, U16, S16, U32, F16 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32. - * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. - * @param[in] policy Conversion policy - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCastKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp deleted file mode 100644 index a3d57115f9..0000000000 --- a/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClCol2ImKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); - - constexpr unsigned int num_elems_read_per_iteration = 8; - - // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_read_per_iteration)); - - // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one - AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); - bool window_changed = update_window_and_padding(win, input_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims, num_groups)); - - _convolved_dims = convolved_dims; - - const DataType data_type = src->data_type(); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(src->dimension(0))); - build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width)); - build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - - _kernel = create_kernel(compile_context, "col2im", build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, _convolved_dims, num_groups); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "col2im_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(num_groups); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); -} - -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); - return Status{}; -} - -void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - - bool is_collapsed = false; - bool is_collapsed_out = false; - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window out_window; - out_window.use_tensor_dimensions(dst->info()->tensor_shape()); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed); - Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out); - - ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out); - - Window slice = collapsed.first_slice_window_3D(); - Window slice_out = collapsed_out.first_slice_window_4D(); - do - { - // Set inputs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice, lws_hint()); - } - 
while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCol2ImKernel.h b/src/core/gpu/cl/kernels/ClCol2ImKernel.h deleted file mode 100644 index 74a9027628..0000000000 --- a/src/core/gpu/cl/kernels/ClCol2ImKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_COL2IM_KERNEL_H -#define ARM_COMPUTE_CL_COL2IM_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the col2im reshaping kernel. - * - * Rearranges each matrix column into image blocks. It's the inverse operation of @ref opencl::kernels::ClIm2ColKernel. - * - * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: - * - * @f[ - * \left( \begin{array}{ccccccccc} - * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccc} - * a0 & a1 & a2 \\ - * a3 & a4 & a5 \\ - * a6 & a7 & a8 \\ - * \end{array} \right) - * @f] - */ -class ClCol2ImKernel : public IClKernel -{ -public: - /** Default constructor */ - ClCol2ImKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCol2ImKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW - * @param[in] convolved_dims Output convolved dimensions. 
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClCol2ImKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - Size2D _convolved_dims; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_COL2IM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp deleted file mode 100644 index d1abd274d6..0000000000 --- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialisation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); - - const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW; - - const int width_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL); - - const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx]; - const unsigned int num_channels = original_src_shape[channel_idx]; - - const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels; - const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? 
num_channels : num_elems_per_src_plane; - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1)); - build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2)); - - // Create kernel - _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3)); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, window); - add_2D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h deleted file mode 100644 index 3976fd45db..0000000000 --- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H -#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. - * - * @note This function can be applied to the 2D weights used by a Fully Connected layer if: - * - It follows a Convolution layer - * - The data layout used by the network does not match the one the model has been trained in. - * - * @note This function assumes the weights are already reshaped (transposed) - */ -namespace opencl -{ -namespace kernels -{ -class ClConvertFullyConnectedWeightsKernel : public IClKernel -{ -public: - ClConvertFullyConnectedWeightsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel); - /** Set the src and dst tensor. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. - * @param[out] dst The converted weights tensor info. Shape and Data Type: Same as @p src. - * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.cpp b/src/core/gpu/cl/kernels/ClCopyKernel.cpp deleted file mode 100644 index 98c6f34e60..0000000000 --- a/src/core/gpu/cl/kernels/ClCopyKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClCopyKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - - // Validate dst if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape()); - } - } - - return Status{}; -} - -} // namespace - -ClCopyKernel::ClCopyKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src); - - // Configure window - const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - - if(dst_window != nullptr) - { - _has_dst_window = true; - _dst_window = Window(*dst_window); - const int width_x = dst_window->num_iterations(0); - const int vec_size_x_leftover = width_x % vec_size_x; - const bool multi_access_x = width_x >= static_cast(vec_size_x); - - if(multi_access_x) - { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); - } - - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); - } - else - { - const int width_x = src->tensor_shape().x(); - const int vec_size_x_leftover = width_x % vec_size_x; - - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); - } - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - - // Build kernel - _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options()); - - // Validate and 
set the window - ICLKernel::configure_internal(win_config); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); - - return Status{}; -} - -void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice; - - if(_has_dst_window) - { - slice = window.first_slice_window_3D(); - Window out_slice = _dst_window.first_slice_window_3D(); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, out_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); - } - else - { - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - slice = collapsed.first_slice_window_3D(); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.h b/src/core/gpu/cl/kernels/ClCopyKernel.h deleted file mode 100644 index d2732c4e59..0000000000 --- a/src/core/gpu/cl/kernels/ClCopyKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H -#define ARM_COMPUTE_CL_COPY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a copy between two tensors */ -class ClCopyKernel : public IClKernel -{ -public: - ClCopyKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel); - /** Initialize the kernel's src, dst. 
- * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCopyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Window _dst_window{}; - bool _has_dst_window{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_COPY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCropKernel.cpp b/src/core/gpu/cl/kernels/ClCropKernel.cpp deleted file mode 100644 index a052ef53f9..0000000000 --- a/src/core/gpu/cl/kernels/ClCropKernel.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClCropKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClCropKernel::ClCropKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, - float extrapolation_value, Window *dst_window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); - - _start = start; - _batch_index = batch_index; - _extrapolation_value = extrapolation_value; - - const int vec_size_x = 4; - // Create and update the window (if needed) - Window win = calculate_max_window(*dst); - - if(dst_window != nullptr) - { - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); - win = *dst_window; - } - - const int dst_width_x = win.num_iterations(0); - const bool multi_access_x = dst_width_x >= vec_size_x; - const bool remainder_x = dst_width_x % vec_size_x > 0; - - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); - build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); - _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); -} - -Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) -{ - ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) - || end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); - } - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3); - } - return Status{}; -} - -void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue 
&queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window in_slice = Window(); - in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); - in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); - - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, window); - add_argument(idx, _start.x); - add_argument(idx, _start.y); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCropKernel.h b/src/core/gpu/cl/kernels/ClCropKernel.h deleted file mode 100644 index d81912284e..0000000000 --- a/src/core/gpu/cl/kernels/ClCropKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_CROP_KERNEL_H -#define ARM_COMPUTE_CL_CROP_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a copy between two tensors */ -class ClCropKernel : public IClKernel -{ -public: - ClCropKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel); - /** Configure kernel - * - * @note Supported tensor rank: up to 4 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC. - * @param[out] dst Destination tensor info. Data type supported: F32 - * @param[in] start Coordinates of where to start cropping the image. - * @param[in] end Coordinates of where to end cropping the image. - * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src. - * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. - * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
- */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCropKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Coordinates2D _start{}; - uint32_t _batch_index{}; - float _extrapolation_value{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CROP_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp deleted file mode 100644 index e3e384f748..0000000000 --- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst); - - return Status{}; -} -} // namespace - -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _depth_offset = depth_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int depth_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, 
dst)); - return Status{}; -} - -void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h deleted file mode 100644 index 0f408477b1..0000000000 --- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the depth concatenate kernel. - * The src tensor will be concatenated into the dst tensor. - */ -class ClDepthConcatenateKernel : public IClKernel -{ -public: - ClDepthConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The dst tensor's low two dimensions can't be smaller than the src one's. 
- * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClDepthConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _depth_offset; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp deleted file mode 100644 index d69da8716c..0000000000 --- a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); - - if(dst->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClDequantizeKernel::ClDequantizeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - const int vec_size_x = 16 / dst->element_size(); - const int output_width_x = dst->tensor_shape().x(); - const bool multi_access_x = (output_width_x / vec_size_x > 0); - - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type()); - std::string kernel_name = "dequantization_layer"; - - // Create kernel - CLBuildOptions build_opts; - if(!is_quantized_per_channel) - { - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0; - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); - } - else - { - kernel_name += "_per_channel"; - kernel_name += src->data_layout() == DataLayout::NCHW ? 
"_nchw" : "_nhwc"; - } - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); - - // Create kernel name - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst); - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); - - // Collapse windo - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = new_window.first_slice_window_3D(); - - if(is_quantized_per_channel) - { - unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters - _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.h b/src/core/gpu/cl/kernels/ClDequantizeKernel.h deleted file mode 100644 index 0912e1b228..0000000000 --- a/src/core/gpu/cl/kernels/ClDequantizeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H -#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the dequantization layer kernel. */ -class ClDequantizeKernel : public IClKernel -{ -public: - ClDequantizeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel); - /** Initialise the kernel's input and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info. Data types supported: F16/F32. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClDequantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp deleted file mode 100644 index 7b98671da2..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
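The kernel removed above implements a straightforward mapping: every quantized input value q becomes scale * (q - offset), and the per-channel variant uses one scale per output channel with a zero offset. The -DSCALE/-DOFFSET/-DVEC_SIZE build options assembled in configure() pass exactly those constants to the OpenCL program. As a point of reference, a minimal self-contained CPU sketch of the same math; the helper names and the channels-innermost layout are illustrative only, not library code:

#include <cstddef>
#include <cstdint>
#include <vector>

// Per-tensor (uniform) dequantization: one scale/offset shared by every element.
std::vector<float> dequantize_uniform(const std::vector<uint8_t> &q, float scale, int32_t offset)
{
    std::vector<float> out(q.size());
    for(size_t i = 0; i < q.size(); ++i)
    {
        out[i] = scale * (static_cast<int32_t>(q[i]) - offset);
    }
    return out;
}

// Per-channel dequantization (e.g. QSYMM8_PER_CHANNEL): one scale per channel, no offset.
// Channels are assumed to be the innermost dimension purely for illustration.
std::vector<float> dequantize_per_channel(const std::vector<int8_t> &q, const std::vector<float> &scales, size_t channels)
{
    std::vector<float> out(q.size());
    for(size_t i = 0; i < q.size(); ++i)
    {
        out[i] = scales[i % channels] * static_cast<int32_t>(q[i]);
    }
    return out;
}
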
- */ -#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), - "Weights feature map dimension should match the respective src's one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) - && std::get<0>(conv_info.stride()) > 2, - "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout != DataLayout::NHWC && !is_data_type_float(src->data_type()) && act_info.enabled(), - "Activation supported only for floating point and NHWC."); - - if(data_layout == DataLayout::NCHW) - { - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); - } - } - - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, 
DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), - "Biases size and number of src feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - } - return Status{}; -} - -inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size <= 5) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NCHW); -} - -inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) -{ - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - if(run_optimized_bifrost) - { - // Configure kernel window - switch(kernel_size) - { - case 1: - { - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 4; - break; - } - case 3: - { - num_elems_read_per_iteration_x = 6; - num_elems_read_per_iteration_y = 5; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 3; - break; - } - case 5: - { - num_elems_read_per_iteration_x = 8; - num_elems_read_per_iteration_y = 6; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 2; - break; - } - default: - { - ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); - } - } - } - else - { - num_elems_read_per_iteration_y = kernel_size; - num_elems_written_per_iteration_x = 8; - num_elems_written_per_iteration_y = 1; - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 8; - break; - case 2: - 
num_elems_read_per_iteration_x = 16; - break; - case 3: - switch(src->element_size()) - { - case 1: - num_elems_read_per_iteration_x = 28; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - case 4: - num_elems_read_per_iteration_x = 22; - break; - default: - ARM_COMPUTE_ERROR("Invalid data size"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 10; - break; - case 2: - num_elems_read_per_iteration_x = 17; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 12; - break; - case 2: - num_elems_read_per_iteration_x = 20; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 16; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid direct convolution size"); - } - } -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) -{ - const DataLayout data_layout = src->data_layout(); - - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); - - if(data_layout == DataLayout::NHWC) - { - const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; - } - - // Create window and update padding - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - return std::make_pair(Status{}, win); - } - else if(data_layout == DataLayout::NCHW) - { - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); - - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, src); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); - AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - -bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - -} // namespace - -BorderSize ClDirectConv2dKernel::border_size() const -{ - return _border_size; -} - -ClDirectConv2dKernel::ClDirectConv2dKernel() -{ - _type = CLKernelType::DIRECT; -} - -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info)); - - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - - _data_layout = src->data_layout(); - _conv_info = conv_info; - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const unsigned int kernel_size = weights->dimension(width_idx); - const DataType data_type = src->data_type(); - - const GPUTarget gpu_target = get_target(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - std::stringstream kernel_name; - CLBuildOptions build_options; - - if(_data_layout == DataLayout::NHWC) - { - _border_size = BorderSize(); - - kernel_name << "direct_convolution_nhwc"; - - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); - const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); - - // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) - { - gemm::update_padding_for_cl_image(weights); - } - - if(biases != nullptr) - { - build_options.add_option(std::string("-DHAS_BIAS")); - build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); - } - - build_options.add_option("-cl-fast-relaxed-math"); - build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); - build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); - build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); - build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); - build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); - build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); - build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); - build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); - build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); - build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); - build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); - build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); - build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); - build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); - build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - int zero_value_s32; - zero_value.get(zero_value_s32); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DIS_QUANTIZED"); - build_options.add_option("-DDST_MULTIPLIER=" + 
support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - } - else - { - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - } - else - { - _border_size = BorderSize(src->padding()); - - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - - build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); - - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); - - if(run_optimized_for_bifrost) - { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - - kernel_name << "_f32_bifrost"; - } - else - { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - - kernel_name.str("direct_convolution_quantized"); - } - } - } - - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); - - // Set 
config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += support::cpp11::to_string(kernel_size); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().left); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().top); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().right); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().bottom); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_x); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_y); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(width_idx)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(height_idx)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); - - return Status{}; -} - -void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Get initial windows - Window slice = window.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(_data_layout == DataLayout::NHWC) - { - cl::Image2D weights_cl_image; - - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); - const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); - - slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); - slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); - - if(export_to_cl_image) - { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); - const TensorShape shape2d(image_w, image_h); - const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; - - // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); - } - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - if(export_to_cl_image) - { - _kernel.setArg(idx++, weights_cl_image); - } - add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, slice); - } - 
enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window win_in = window; - - win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); - win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); - - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - const int conv_stride_x = std::get<0>(_conv_info.stride()); - const int conv_stride_y = std::get<1>(_conv_info.stride()); - - win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); - add_3D_tensor_argument(idx1, weights, slice); - - if(biases != nullptr) - { - Window slice_biases; - slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); - add_1D_tensor_argument(idx1, biases, slice_biases); - } - - _kernel.setArg(idx1++, static_cast(weights->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h deleted file mode 100644 index 4880d4a668..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H -#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the direct convolution kernel. */ -class ClDirectConv2dKernel : public IClKernel -{ -public: - ClDirectConv2dKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel); - /** Set the src, weights, biases and dst tensors info. - * - * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. 
Need to change this once the set_valid_region() is removed. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2, stride_y = 1/2 - * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 - * 9x9 convolution with stride_x = 1/2, stride_y = 1/2 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported:Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClDirectConv2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - DataLayout _data_layout{}; - BorderSize _border_size{}; - PadStrideInfo _conv_info{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp deleted file mode 100644 index 3d9f0b6fcf..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
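Both validate_arguments() and configure() in the deleted ClDirectConv2dKernel.cpp fold the three quantization scales into a single real factor, multiplier = input_scale * weights_scale / output_scale, and hand it to quantization::calculate_quantized_multiplier() so the kernel can requantize its int32 accumulator with an integer multiply and shift instead of a float multiply. A rough sketch of what such a decomposition produces, under the convention M ≈ (quant_mult / 2^31) * 2^-shift; the library's exact rounding and corner-case handling may differ, and the helper name is illustrative:

#include <cmath>
#include <cstdint>

// Split a positive real multiplier M into a 31-bit fixed-point multiplier and a shift,
// so that M ≈ (quant_mult / 2^31) * 2^-shift with quant_mult in [2^30, 2^31).
void decompose_multiplier(float multiplier, int32_t &quant_mult, int32_t &shift)
{
    int exponent = 0;
    const float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
    shift = -exponent;
    int64_t q = std::llround(static_cast<double>(mantissa) * 2147483648.0); // mantissa * 2^31
    if(q == (1ll << 31)) // mantissa rounded up to 1.0: renormalise
    {
        q /= 2;
        --shift;
    }
    quant_mult = static_cast<int32_t>(q);
}

A positive shift then corresponds to a right shift applied after the fixed-point multiplication, a negative one to a left shift.
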
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/common/utils/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-#include <map>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
-    { ArithmeticOperation::DIV, "DIV" },
-    { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
-    { ArithmeticOperation::MIN, "MIN" },
-    { ArithmeticOperation::MAX, "MAX" },
-    { ArithmeticOperation::POWER, "POWER" },
-    { ArithmeticOperation::PRELU, "PRELU" },
-};
-
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
-};
-
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
-{
-    std::string config_id;
-    // Set config_id for enabling LWS tuning
-    config_id = kernel_name;
-    config_id += "_";
-    config_id += lower_string(string_from_data_type(src1.data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(dst.dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(dst.dimension(1));
-    return config_id;
-}
-
-Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape)
-{
-    if(in_place)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ?
src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - return Status{}; -} - -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - - // Check whether it is in_place calculation - const bool in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - } - - return Status{}; -} - -Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); - } - - return Status{}; -} - -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - - if(is_data_type_quantized_symmetric(src1.data_type())) - { - const int32_t in1_offset = src1.quantization_info().uniform().offset; - const int32_t in2_offset = src2.quantization_info().uniform().offset; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero"); - } - - // Check whether it is in_place calculation - const bool 
in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - - if(is_data_type_quantized_symmetric(dst.data_type())) - { - const int32_t offset = dst.quantization_info().uniform().offset; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); - } - } - return Status{}; -} - -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) -{ - CLBuildOptions build_opts; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) - { - const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); - const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst.quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset)); - build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); - } - build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32"); - - // Check whether it is in_place calculation - const bool in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - build_opts.add_option_if(in_place, "-DIN_PLACE"); - build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); - - return build_opts; -} - -std::pair configure_window_arithmetic_common(ITensorInfo &dst) -{ - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); - return std::make_pair(Status{}, win); -} - -std::pair 
validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - auto_init_if_empty(dst, out_shape, 1, src1.data_type()); - - return configure_window_arithmetic_common(dst); -} - -std::pair validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - set_shape_if_empty(dst, out_shape); - set_data_type_if_unknown(dst, DataType::U8); - - return configure_window_arithmetic_common(dst); -} - -std::pair validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - auto_init_if_empty(dst, out_shape, 1, src1.data_type()); - - return configure_window_arithmetic_common(dst); -} -} // namespace - -ClElementwiseKernel::ClElementwiseKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - // Configure kernel window - auto win_config = validate_and_configure_window(*src1, *src2, *dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) - { - kernel_name += "_quantized"; - } - - // Set kernel build options - CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b())); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - ICLKernel::configure_internal(win_config.second); - - _config_id = generate_id_for_tuning(kernel_name, *src1, *dst); -} - -void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < 
out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - // Check whether it is in_place calculation - const bool in_place = (src_0 == dst) || (src_1 == dst); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_src1); - add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - - enqueue(queue, *this, slice, lws_hint()); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} - -/** Logical binary */ - -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); - _op = op; - configure_common(compile_context, src1, src2, dst); -} - -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); - - return Status{}; -} - -std::string ClLogicalBinaryKernel::name() -{ - switch(_op) - { - case LogicalOperation::And: - return "AND"; - case LogicalOperation::Or: - return "OR"; - case LogicalOperation::Not: - /* fall through */ - default: - ARM_COMPUTE_ASSERT(true); - } - return ""; -} - -std::pair ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); -} - -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - // The arithmetic utility functions can be share - return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); -} - -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) -{ - return generate_id_for_tuning_common(kernel_name, src1, dst); -} - -/** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, - const ConvertPolicy 
&policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); - - _policy = policy; - _op = op; - _act_info = act_info; - configure_common(compile_context, input1, input2, output); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(op, policy); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); - - return Status{}; -} - -std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) -{ - return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); -} - -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) -{ - const bool has_float_out = is_data_type_float(output.data_type()); - auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); - build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); - return build_options; -} - -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) -{ - auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); - config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; - config_id += lower_string(string_from_data_layout(input1.data_layout())); - return config_id; -} - -std::string ClSaturatedArithmeticKernel::name() -{ - return supported_sat_arithmetic_ops[_op]; -} - -/** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); - - _op = op; - _act_info = act_info; - configure_common(compile_context, src1, src2, dst); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) - { - // Partial integer support S32/F32/F16 - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - else if(op == ArithmeticOperation::POWER) - { - // Power operators doesn't support integer arithmetic - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - return Status{}; -} -std::pair ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) - { - // Division and Power operators don't support integer arithmetic - return validate_and_configure_window_for_division(src1, src2, dst); - } - else - { - return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst); - } -} - -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); -} -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) -{ - return generate_id_for_tuning_common(kernel_name, src1, dst); -} - -std::string ClArithmeticKernel::name() -{ - return supported_arithmetic_ops[_op]; -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h deleted file mode 100644 index 4525cec55b..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H -#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H - -#include "src/core/KernelTypes.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for an element-wise operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f] - * - * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. - * - */ -class ClElementwiseKernel : public IClKernel -{ -public: - ClElementwiseKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -protected: - /** The name of the operation */ - virtual std::string name() = 0; - - /** Configure kernel for a given list of arguments - * - * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. 
- * - * @return a pair of Status and Window - */ - virtual std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; - - /** Generate the build options for the specific kernel - * - * @reutrn a CLBuildOptions struct - */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; - - /** Generate the identifier for tuning - * - * @reutrn a string - */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; - - /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) - * - */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - - ActivationLayerInfo _act_info{}; -}; - -class ClLogicalBinaryKernel : public ClElementwiseKernel -{ -public: - ClLogicalBinaryKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel); - /** Function to configure kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Logical binary operation to be executed. - * @param[in] src1 First source tensor info. Data types supported: U8. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - */ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogicalBinaryKernel::configure() - * - * @return a status - */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - -private: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; -}; - -/** Addition operation */ -class ClSaturatedArithmeticKernel : public ClElementwiseKernel -{ -public: - ClSaturatedArithmeticKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel); - /** Static function to check if given info will lead to a valid configuration of @ref ClSaturatedArithmeticKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
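 *
 * A minimal configure/validate sketch (illustrative only; compile_ctx and the three ITensorInfo
 * objects are hypothetical names assumed to be created and initialised elsewhere):
 * @code
 * ClSaturatedArithmeticKernel add_kernel;
 * ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(
 *     ArithmeticOperation::ADD, &src0_info, &src1_info, &dst_info, ConvertPolicy::SATURATE));
 * add_kernel.configure(compile_ctx, ArithmeticOperation::ADD, &src0_info, &src1_info, &dst_info,
 *                      ConvertPolicy::SATURATE);
 * @endcode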
- */ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClSaturatedArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - -protected: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; - -private: - ConvertPolicy _policy{}; - ArithmeticOperation _op{}; -}; - -class ClArithmeticKernel : public ClElementwiseKernel -{ -public: - ClArithmeticKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel); - - /** Static function to check if given info will lead to a valid configuration of @ref ClArithmeticKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Arithmetic operation to be executed. - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - -protected: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - -private: - ArithmeticOperation _op{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp deleted file mode 100644 index 1525c0fe54..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); - } - else if(op == ElementWiseUnary::NEG) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); - } - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - } - - return Status{}; -} -} // namespace - -ClElementWiseUnaryKernel::ClElementWiseUnaryKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op)); - - const std::string kernel_name = "elementwise_unary"; - const int vec_size_x = 16 / dst->element_size(); - const int dst_width_x = dst->tensor_shape().x(); - const bool multi_access_x = (dst_width_x / vec_size_x > 0); - - // Set kernel build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - switch(op) - { - case ElementWiseUnary::RSQRT: - build_opts.add_option("-DOPERATION=rsqrt_op"); - break; - case ElementWiseUnary::EXP: - 
build_opts.add_option("-DOPERATION=exp_op"); - break; - case ElementWiseUnary::NEG: - build_opts.add_option("-DOPERATION=neg_op"); - break; - case ElementWiseUnary::SIN: - build_opts.add_option("-DOPERATION=sin_op"); - break; - case ElementWiseUnary::ABS: - build_opts.add_option("-DOPERATION=fabs_op"); - break; - case ElementWiseUnary::LOG: - build_opts.add_option("-DOPERATION=natural_log_op"); - break; - case ElementWiseUnary::ROUND: - build_opts.add_option("-DOPERATION=round_op"); - break; - case ElementWiseUnary::LOGICAL_NOT: - build_opts.add_option("-DOPERATION=logical_not_op"); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst); - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op)); - - return Status{}; -} - -void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h deleted file mode 100644 index 64cc2f7afc..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H -#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the elementwise unary operator */ -class ClElementWiseUnaryKernel : public IClKernel -{ -public: - ClElementWiseUnaryKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel); - /** Initialise the kernel's srcs, dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] op Element wise unary operation to perform. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClElementWiseUnaryKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClFillKernel.cpp b/src/core/gpu/cl/kernels/ClFillKernel.cpp deleted file mode 100644 index f213bf8e6a..0000000000 --- a/src/core/gpu/cl/kernels/ClFillKernel.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClFillKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClFillKernel::ClFillKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, - const PixelValue &constant_value, - Window *window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window)); - - const DataType data_type = tensor->data_type(); - const int vec_size_x = 16 / tensor->element_size(); - - // Create and update the window (if needed) - _full_window = calculate_max_window(*tensor); - Window win = _full_window; - if(window != nullptr) - { - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); - win = *window; - } - - const int output_width_x = win.num_iterations(0); - const bool multi_access_x = output_width_x >= vec_size_x; - const bool remainder_x = output_width_x % vec_size_x > 0; - - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); - _kernel = create_kernel(compile_context, "memset", build_opts.options()); -} - -Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window) -{ - ARM_COMPUTE_UNUSED(tensor); - ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); - } - return Status{}; -} - -void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - - // Collapse all the batches on the third - Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, tensor, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClFillKernel.h b/src/core/gpu/cl/kernels/ClFillKernel.h deleted file mode 100644 index ecc2546e4a..0000000000 --- a/src/core/gpu/cl/kernels/ClFillKernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H -#define ARM_COMPUTE_CL_FILL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for filling the planes of a tensor */ -class ClFillKernel : public IClKernel -{ -public: - ClFillKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel); - /** Initialise the kernel's tensor and filling value - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] tensor Input tensor info. Supported data types: All. - * @param[in] constant_value The value used to fill the planes of the tensor - * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClFillKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Window _full_window{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.cpp b/src/core/gpu/cl/kernels/ClFloorKernel.cpp deleted file mode 100644 index 2047128963..0000000000 --- a/src/core/gpu/cl/kernels/ClFloorKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClFloorKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - - // Validate in case of configured output - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClFloorKernel::ClFloorKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize output - auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - - // Validate - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); - - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)); - - // Create kernel - _kernel = create_kernel(compile_context, "floor_layer", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vec_size_x)); - IClKernel::configure_internal(win); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status 
ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.h b/src/core/gpu/cl/kernels/ClFloorKernel.h deleted file mode 100644 index 57c9906f2c..0000000000 --- a/src/core/gpu/cl/kernels/ClFloorKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H -#define ARM_COMPUTE_CL_FLOOR_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a floor operation */ -class ClFloorKernel : public IClKernel -{ -public: - ClFloorKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: F16/F32. - * @param[out] dst Destination tensor info. 
Same as @p src - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClFloorKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index ec0a3bf8e0..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast(n)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = 
num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) - { - reinterpret_dst_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*dst); - - if(reinterpret_dst_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - - window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. 
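// Worked example (hypothetical sizes) for the partial-block handling computed further down in this
// function: with internal_m = 100, gemm_info.n() = 30, lhs_info.m0 = 8 and rhs_info.n0 = 4, the
// kernel is built with PARTIAL_STORE_M0 = 100 % 8 = 4 and PARTIAL_STORE_N0 = 30 % 4 = 2, so the
// ragged bottom/right blocks are stored partially instead of requiring dst padding.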
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - std::string kernel_name("gemmlowp_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h deleted file mode 100644 index eaa125fbf2..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ -class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyNativeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] src1 Source tensor containing the RHS matrix. 
Data type supported: same as @p src0 - * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index 44fda01ded..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo 
&rhs_info, const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - return std::make_pair(Status{}, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _k = gemm_info.k(); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // Check if we need to slide the matrix B - const unsigned int num_dimensionssrc0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); - - auto padding_info = get_padding_info({ src0, src1, dst }); - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m() : dst->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemmlowp_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(_k)); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, 
static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h deleted file mode 100644 index 99cff011d1..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped - * - * @note The input matrices @p src0 and @p src1 must be reshaped through: - * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel - * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - */ -class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyReshapedKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. 
Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp deleted file mode 100644 index 9d626936ff..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m; - const int n = gemm_info.n; - const int k = gemm_info.k; - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), - "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); - - // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - - if(expected_dst_shape.num_dimensions() > 1) - { - const unsigned int dst_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - TensorShape collapsed_dst_shape(expected_dst_shape); - collapsed_dst_shape.collapse_from(dst_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], - "vector_sum_row must have the same number of batches of dst tensor"); - - if(gemm_info.a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - - if(output_multipliers != nullptr && output_shifts != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); - } - } - } - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) -{ - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) - { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); - } - else - { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32)); - } - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; - num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - if(gemm_info.a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); - } - // No access window needed for vector_sum_row - ARM_COMPUTE_UNUSED(vector_sum_row); - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, bias_access); - } - - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - const int32_t a_offset = gemm_info.a_offset; - const int32_t b_offset = gemm_info.b_offset; - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. 
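// Editor's note (worked example with hypothetical sizes, not part of the original file):
// with internal_m = 100 and internal_m0 = 8, the last block along M holds 100 % 8 = 4
// valid rows; with gemm_info.n = 30 and rhs_info.n0 = 4, the last block along N holds
// 30 % 4 = 2 valid columns. The kernel stores only those leftover elements instead of
// padding dst, which is what the PARTIAL_STORE_M0/N0 build options below encode.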
- const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - - std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? 
"t" : "nt"; - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - kernel_name += "_fused_output_stage_fixedpoint"; - _fuse_output_stage = true; - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(dst->data_type()); - build_opts.add_option_if(min != min_val.get(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if(max != max_val.get(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? 
vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - idx++; - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - idx++; - } - - if(_fuse_output_stage) - { - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); - } - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h deleted file mode 100644 index 9e52b38249..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped - * - * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported - */ -class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0 - * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. 
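Editorial sketch (not part of the original file): a minimal illustration of how a caller might fill in the GEMMKernelInfo descriptor consumed by this kernel, using only field and enum names that appear elsewhere in this patch; all numeric values are placeholders, not validated configurations.

GEMMKernelInfo gemm_info{};
gemm_info.m = 64;                       // illustrative problem sizes
gemm_info.n = 32;
gemm_info.k = 128;
gemm_info.lhs_info.m0 = 4;              // block sizes within the documented ranges
gemm_info.lhs_info.k0 = 16;
gemm_info.rhs_info.n0 = 4;
gemm_info.rhs_info.k0 = 16;
gemm_info.rhs_info.h0 = 2;
gemm_info.rhs_info.transpose = true;    // this kernel requires a transposed reshaped RHS
gemm_info.a_offset = -10;               // illustrative quantization offsets
gemm_info.b_offset = -5;
gemm_info.output_stage.type             = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
gemm_info.output_stage.output_data_type = DataType::QASYMM8;
gemm_info.output_stage.gemmlowp_offset  = 2;
gemm_info.output_stage.gemmlowp_multipliers = { 1073741824 };
gemm_info.output_stage.gemmlowp_shifts      = { 1 };
gemm_info.output_stage.gemmlowp_min_bound   = 0;
gemm_info.output_stage.gemmlowp_max_bound   = 255;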
- */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp deleted file mode 100644 index e491cca914..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} -} // namespace - -ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - std::string kernel_name("gemmlowp_offset_contribution"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - IClKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(2)); 
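// Editor's note (illustrative, not part of the original file): adjust_vec_size(4, ...)
// above picks the per-iteration vector width (at most 4 here); e.g. an mm_result row
// length of 10 gives VEC_SIZE_LEFTOVER = 10 % 4 = 2, so the kernel processes a
// 2-element tail at the end of each row rather than relying on padded tensors.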
- - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - return Status{}; -} - -void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_SRC_DST)); - - Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, mm_result, slice); - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h deleted file mode 100644 index d1712f4f4b..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class ClGemmLowpOffsetContributionKernel : public IClKernel -{ -public: - ClGemmLowpOffsetContributionKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. 
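// Editor's note (worked example with hypothetical values, not part of the original file):
// for one accumulator with k = 2, an LHS row (3, 4), an RHS column (5, 6),
// a_offset = -1 and b_offset = -2, the documented formula gives:
//   mm_result         = 3*5 + 4*6                          = 39
//   vector_sum_col[k] = 5 + 6                               = 11
//   vector_sum_row[i] = 3 + 4                               = 7
//   corrected result  = 39 + 11*(-1) + 7*(-2) + (-1)*(-2)*2 = 18
// which matches (3 - 1)*(5 - 2) + (4 - 1)*(6 - 2) = 18, i.e. the product of the
// offset-adjusted operands that this kernel reconstructs in-place.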
- */ - void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index 1e2d7d7efe..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst); - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); - - return Status{}; -} -} // namespace - -ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Auto initialize the output - auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + 
support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(dst->data_type()); - build_opts.add_option_if((min > min_val.get()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < max_val.get()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - - std::string kernel_name("gemmlowp_offset_contribution"); - kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - return Status{}; -} - -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, mm_result, slice); - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - add_3D_tensor_argument(idx, dst, slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 977f2eac53..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution - * of matrix A and matrix B and performs the output stage defined by the output_stage argument - * - * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. - */ -class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel -{ -public: - ClGemmLowpOffsetContributionOutputStageKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). 
- * Supported data types: S32 - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _is_quantized_per_channel{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp deleted file mode 100644 index 8aec1654d9..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - // Set the arguments to pass at compile time - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); 
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Create src window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h deleted file mode 100644 index c935aa7ec4..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. - */ -class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp deleted file mode 100644 index 9b488ff329..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
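[Editor's note: the quantize-down-by-fixed-point steps listed in the ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel comment above amount to the per-element computation sketched below. This is a minimal scalar sketch assuming the QASYMM8 output case and a positive result_shift; the saturation and tie-breaking corner cases handled by the OpenCL kernel are elided, and the function name and parameters are placeholders, not ACL API.]

#include <algorithm>
#include <cstdint>

// Illustrative scalar form of the fixed-point output stage (QASYMM8 case, simplified rounding).
uint8_t quantize_down_fixedpoint_ref(int32_t acc, int32_t bias, int32_t result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift,
                                     int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                                     // optional bias
    acc = static_cast<int32_t>((static_cast<int64_t>(acc) * result_fixedpoint_multiplier + (1ll << 30)) >> 31); // fixed-point multiply
    if(result_shift > 0)
    {
        acc = (acc + (1 << (result_shift - 1))) >> result_shift;                     // round-to-nearest divide by 2^result_shift
    }
    acc += result_offset_after_shift;                                                // add the output offset
    acc = std::max(min_bound, std::min(max_bound, acc));                             // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, acc)));                    // clamp to the QASYMM8 range and cast
}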
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const 
CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); - build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h deleted file mode 100644 index eff8c4b2be..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleByFloatKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. 
Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index 9a25973a93..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
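[Editor's note: in contrast to the fixed-point variant, the ClGemmLowpQuantizeDownInt32ScaleByFloatKernel above rescales each accumulator with a single real multiplier. A rough scalar sketch for the QASYMM8 case follows; it is illustrative only, the names are placeholders, and the exact rounding is defined by the OpenCL kernel.]

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar form of the float-scale output stage (QASYMM8 case).
uint8_t quantize_down_float_ref(int32_t acc, int32_t bias, float real_multiplier,
                                int32_t output_offset, int32_t min_bound, int32_t max_bound)
{
    const float scaled = static_cast<float>(acc + bias) * real_multiplier;  // scale by the real multiplier
    int32_t v = static_cast<int32_t>(std::lround(scaled)) + output_offset;  // round and add the output offset
    v = std::max(min_bound, std::min(max_bound, v));                        // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, v)));             // clamp to the QASYMM8 range and cast
}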
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} //namespace - -ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *output_stage) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - // Set the arguments to pass at compile time - auto min = output_stage->gemmlowp_min_bound; - auto max = output_stage->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULT_INT=" + 
support::cpp11::to_string(output_stage->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index c5374755c8..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp deleted file mode 100644 index b4886805fb..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
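[Editor's note: the plain integer-scale stage in the ClGemmLowpQuantizeDownInt32ScaleKernel header above uses an integer multiplier and shift instead of a fixed-point or real multiplier. A minimal scalar sketch for the QASYMM8 case follows; it is illustrative only, and the precise ordering of the bias/offset additions and the rounding behaviour are defined by the OpenCL kernel gemmlowp_output_stage_quantize_down.]

#include <algorithm>
#include <cstdint>

// Illustrative scalar form of the integer-scale output stage (QASYMM8 case, simplified).
uint8_t quantize_down_scale_ref(int32_t acc, int32_t bias, int32_t result_offset, int32_t result_mult_int,
                                int32_t result_shift, int32_t min_bound, int32_t max_bound)
{
    int32_t v = (acc + bias + result_offset) * result_mult_int;  // add the offset terms, then scale by the integer multiplier
    v >>= result_shift;                                          // divide by 2^result_shift
    v = std::max(min_bound, std::min(max_bound, v));             // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, v)));  // clamp to the QASYMM8 range and cast
}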
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/KernelDescriptors.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} - -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - 
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - - std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - // This kernel does not need padding - Window win = calculate_max_window(*vector_sum_row, Steps()); - ICLKernel::configure_internal(win); - - _config_id = kernel_name; - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - return Status{}; -} - -void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice_in = collapsed.first_slice_window_2D(); - Window slice_out = collapsed.first_slice_window_2D(); - - // Setup input slice. Its dimensions are increased in the cl kernel. 
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_2D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} - -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); - build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration)); - IClKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - return Status{}; -} - -void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window slice_out = collapsed.first_slice_window_2D(); - Window slice_in = slice_out; - - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_2D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git 
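The two kernels deleted above produce the per-row sums of matrix A (vector_sum_row) and the per-column sums of matrix B (vector_sum_col). As the gemmlowp notes referenced in the header file that follows explain, these sums are exactly what is needed to correct a raw integer accumulation for the quantization offsets. A scalar model of that correction, under one common sign convention and purely for illustration (it is not the OpenCL code and the function name is hypothetical):

    #include <cstdint>
    #include <vector>

    // Scalar sketch of the role played by the two reductions above.
    // a: M x K, b: K x N, acc: M x N raw int32 accumulators of a*b, all row-major.
    void apply_offset_correction(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                 std::vector<int32_t> &acc, int M, int N, int K,
                                 int32_t a_offset, int32_t b_offset)
    {
        std::vector<int32_t> vector_sum_row(M, 0); // role of ClGemmLowpMatrixAReductionKernel
        std::vector<int32_t> vector_sum_col(N, 0); // role of ClGemmLowpMatrixBReductionKernel

        for(int i = 0; i < M; ++i)
            for(int k = 0; k < K; ++k)
                vector_sum_row[i] += a[i * K + k];

        for(int k = 0; k < K; ++k)
            for(int j = 0; j < N; ++j)
                vector_sum_col[j] += b[k * N + j];

        // sum_k (a_ik - a_offset) * (b_kj - b_offset)
        //   = acc_ij - b_offset * rowsum_i(A) - a_offset * colsum_j(B) + K * a_offset * b_offset
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
                acc[i * N + j] += -b_offset * vector_sum_row[i]
                                  - a_offset * vector_sum_col[j]
                                  + K * a_offset * b_offset;
    }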
a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h deleted file mode 100644 index 11188ed062..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Common interface for all OpenCL reduction kernels */ -class IClGemmLowpReductionKernel : public IClKernel -{ -public: - IClGemmLowpReductionKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. 
Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
deleted file mode 100644
index 6079644935..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -inline Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (src0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The src1 tensor cannot have more than 2 dimensions if src0 has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (src2 != nullptr) - && (!reshape_info.broadcast_bias()), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); 
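The checks above (and the bias dimension checks that follow for both the reshaped and non-reshaped paths) describe the operation this kernel ultimately computes: dst = alpha * (A * B) + beta * C, where C is either a full M x N matrix or, when broadcast_bias is set, a single row of length N added to every output row. A scalar reference of that contract, illustrative only:

    #include <cstddef>
    #include <vector>

    // Scalar reference of the GEMM contract being validated above.
    // a: M x K, b: K x N, bias: M x N or 1 x N (broadcast), dst: M x N, all row-major.
    void gemm_reference(const std::vector<float> &a, const std::vector<float> &b,
                        const std::vector<float> &bias, std::vector<float> &dst,
                        std::size_t M, std::size_t N, std::size_t K,
                        float alpha, float beta, bool broadcast_bias)
    {
        for(std::size_t i = 0; i < M; ++i)
        {
            for(std::size_t j = 0; j < N; ++j)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                {
                    acc += a[i * K + k] * b[k * N + j];
                }
                const float c = bias.empty() ? 0.f : (broadcast_bias ? bias[j] : bias[i * N + j]);
                dst[i * N + j] = alpha * acc + beta * c;
            }
        }
    }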
- - if(!is_interleaved_transposed) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != src1->dimension(1)); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int m = reshape_info.reinterpret_input_as_3d() ? src0->dimension(1) * src0->dimension(2) : src0->dimension(1); - const unsigned int n = src1->dimension(0); - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(reshape_info.broadcast_bias()) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - } - else - { - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - const auto m = static_cast(reshape_info.m()); - const auto n = static_cast(reshape_info.n()); - const int k = reshape_info.k(); - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - rhs_info.n0 = max_cl_vector_width / src1->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(reshape_info.broadcast_bias()) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -inline std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, - ElementsProcessed 
&num_elements_processed) -{ - ARM_COMPUTE_UNUSED(beta); - bool window_changed = false; - Window win{}; - Window win_out{}; - - const DataType data_type = src0->data_type(); - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - // dst tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - if(is_interleaved_transposed) - { - // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set - ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d()); - - // Configure kernel window - num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - num_elems_processed_per_iteration_y = 4; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); - - window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop - } - } - else // The input tensors have not been reshaped - { - // Special case for 1xN, 2xN, 3xN and 4xN src0 tensor. num_elems_processed_per_iteration_x is set up for the default case. - num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - num_elems_processed_per_iteration_y = std::min(static_cast(dst->dimension(1)), 4); - - // Create kernels according to the architecture, data type and input size. - GPUTarget arch_target = get_arch_from_target(gpu_target); - if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) - { - num_elems_processed_per_iteration_x = (src1->dimension(0) <= 1000 && src0->num_dimensions() == 1) ? 
2 : 4; - } - - // Configure window - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyKernel::ClGemmMatrixMultiplyKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, - float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta, - is_interleaved_transposed, reshape_info, fp_mixed_precision)); - - auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst }); - - _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); - _add_bias = src2 != nullptr; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ? 
src0->num_dimensions() - 1 : src0->num_dimensions(); - - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - const DataType data_type = src0->data_type(); - - // Get target architecture - GPUTarget gpu_target = get_target(); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, - gpu_target, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false) - // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) - const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1); - const unsigned int n = dst->dimension(0); - - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - const unsigned int m0 = num_elements_processed.y(); - const unsigned int n0 = num_elements_processed.x(); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % m0; - const unsigned int partial_store_n0 = n % n0; - - // Create build options - CLBuildOptions build_opts; - - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); - build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); - build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b())); - build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0))); - - const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST; - - std::string kernel_name; - if(is_interleaved_transposed) - { - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + 
support::cpp11::to_string(n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width))); - build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - if(is_data_type_float(data_type) && is_bifrost) - { - kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; - } - else - { - kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); - if(fp_mixed_precision && data_type == DataType::F16) - { - // currently wider accumulator is only supported for fp16 kernels. - kernel_name += "_acc32"; - } - } - } - else // The input tensors have not been reshaped - { - build_opts.add_option("-DN=" + support::cpp11::to_string(n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - // Create kernels according to the architecture, data type and input size. - if(is_data_type_float(data_type) && is_bifrost) - { - kernel_name = "gemm_mm_floating_point"; - - if(src0->num_dimensions() != 1) - { - kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; - if(fp_mixed_precision && data_type == DataType::F16) - { - // currently wider accumulator is only supported for fp16 kernels. - kernel_name += "_acc32"; - } - } - else if(src1->dimension(0) <= 1000 && data_type == DataType::F32) - { - // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and - // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g. - // FC6 and FC7 of AlexNet and VGG-16). - kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000"; - } - - // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels - // via exhaustive autotuning over a range of representative layer configurations. - set_lws_hint(cl::NDRange(4)); - } - else // (MIDGARD and F32) or (F16) - { - kernel_name = "gemm_mm_floating_point"; - } - } - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = "gemm_"; - _config_id += (is_interleaved_transposed ? "reshaped_" : ""); - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : ""); - _config_id += (fp_mixed_precision ? "fp_mixed_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(3)); - _config_id += "_"; - _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1))); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) -{ - // Note: num_elements_processed will be set in validate_and_configure_window() - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(activation_info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? src2->clone().get() : nullptr, - dst->clone().get(), - beta, - is_interleaved_transposed, - reshape_info, - gpu_target, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0; - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0) + num_arguments_bias; - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) - { - add_2D_tensor_argument(idx, src2, slice); - } - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h deleted file mode 100644 index c303f78b07..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply two input matrices "A" and "B" and add a martix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result. 
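Earlier in this hunk, run_op passes the optional cross-plane paddings at argument index 3 * num_arguments_per_2D_tensor() + 3 (plus num_arguments_per_2D_tensor() + 1 when a bias is present). That constant follows directly from the argument layout the enqueue loop builds: one set of 2-D tensor arguments per tensor, then one stride_z word per tensor, and only then the padding words. A small restatement of that arithmetic (illustrative helper, not ACL API):

    #include <cstddef>

    // args_per_2d stands in for num_arguments_per_2D_tensor(). Per slice the kernel
    // receives, in order:
    //   [src0][src1][src2 if bias][dst]                       2-D tensor arguments
    //   [stride_z src0][src1][src2 if bias][dst]              one word each
    //   [input cross-plane pad][output cross-plane pad]       optional, set once before the loop
    std::size_t first_cross_plane_pad_index(std::size_t args_per_2d, bool add_bias)
    {
        const std::size_t num_tensors = add_bias ? 4 : 3;
        return num_tensors * args_per_2d + num_tensors; // 3*N + 3, or 4*N + 4 with bias
    }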
- * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object - * - * @note If the input tensors @p src0 and @p src1 have been reshaped respectively with @ref ClGemmReshapeLhsMatrixKernel" and @ref ClGemmReshapeRhsMatrixKernel, - * the flag @p is_interleaved_transposed must be set to true - * - * @attention @p src1 tensor must have at least 2 dimensions (matrix) - */ -class ClGemmMatrixMultiplyKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel); - /** Initialise the kernel's input, output and alpha - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the Matrix A. Data types supported: F16/F32 - * @param[in] src1 Input tensor containing the Matrix B. Data type supported: same as @p src0 - * @param[in] src2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p src0 - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported. - * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel - * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped - * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy - * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication - * - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta = 0.f, - bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _add_bias{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index 5ae55ab04a..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported 
for GEMM native"); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); - } - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, - float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - auto padding_info = get_padding_info({ src0, dst }); - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
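The PARTIAL_STORE_M0 / PARTIAL_STORE_N0 values computed above tell the device code how many rows and columns the last block in each dimension really contains, so the boundary work-items can store a smaller block instead of relying on output padding. A small worked example under assumed sizes (illustrative only):

    #include <cstddef>

    // Tail-block sizes for the boundary work-items, mirroring the two modulo
    // computations above; a result of 0 means the last block is a full one.
    struct TailBlocks
    {
        std::size_t partial_m0;
        std::size_t partial_n0;
    };

    TailBlocks tail_blocks(std::size_t m, std::size_t n, std::size_t m0, std::size_t n0)
    {
        return { m % m0, n % n0 };
    }
    // e.g. m = 33, n = 24, m0 = 4, n0 = 16 -> partial_m0 = 1, partial_n0 = 8:
    // the bottom row of blocks stores 1 row and the right-hand column of blocks stores 8 columns.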
- // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - - std::string kernel_name("gemm_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? 
"fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4; - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3; - } - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0); - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - } - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) - { - add_2D_tensor_argument(idx, src2, slice); - } - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h deleted file mode 100644 index cd7bf278c2..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */ -class ClGemmMatrixMultiplyNativeKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyNativeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32. 
The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst dst tensor info. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported: - * lhs_info.m0: 1,2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same of lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index 591834f762..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
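For reference, the tile-size bookkeeping behind the -DM0 / -DPARTIAL_STORE_M0 / -DPARTIAL_STORE_N0 options set in the native kernel's configure() above can be summarised as a small standalone sketch. The names below are hypothetical and the snippet is not part of the library; the kernels compute the same quantities inline (M0 is clamped to M to prevent out-of-bounds reads, and the remainders size the partial boundary blocks so the destination needs no padding).

#include <algorithm>
#include <cstdio>

// Hypothetical standalone helper: derives the block size and partial-block
// sizes that the GEMM kernels receive as build options.
struct GemmBlocking
{
    unsigned int m0;               // effective block height (M0)
    unsigned int partial_store_m0; // rows in the last, partial block of a column
    unsigned int partial_store_n0; // columns in the last, partial block of a row
};

GemmBlocking compute_blocking(unsigned int internal_m, unsigned int n,
                              unsigned int lhs_m0, unsigned int rhs_n0)
{
    // Shrink M0 so it never exceeds M ("internal_m0" in the kernels above).
    const unsigned int m0 = std::min(internal_m, lhs_m0);
    // Remainders size the boundary blocks: they are written with narrowed
    // (partial) stores instead of padding the destination tensor.
    return { m0, internal_m % m0, n % rhs_n0 };
}

int main()
{
    const GemmBlocking b = compute_blocking(/*internal_m=*/37, /*n=*/50, /*lhs_m0=*/4, /*rhs_n0=*/8);
    std::printf("M0=%u PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n", b.m0, b.partial_store_m0, b.partial_store_n0);
    return 0;
}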
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ 
src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 
1 : num_elems_processed_per_iteration_y; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); - - window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - auto padding_info = get_padding_info({ src0, dst }); - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - _export_to_cl_image = rhs_info.export_to_cl_image; - _k = gemm_info.k; - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - const bool enable_mixed_precision = gemm_info.fp_mixed_precision; - const DataType data_type = src0->data_type(); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m : dst->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); - build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); - build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemm_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_add_bias ? 
"add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += (enable_mixed_precision ? "mixed_precision_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - - cl::Image2D src1_image2d; - - if(_export_to_cl_image) - { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); - const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - - // LHS buffer - add_2D_tensor_argument(idx, src0, slice); - - // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) - { - _kernel.setArg(idx++, src1_image2d); - } - else - { - add_2D_tensor_argument(idx, src1, slice_b); - } - - // Bias buffer (_add_bias == true) - add_2D_tensor_argument_if(_add_bias, idx, src2, slice); - - // dst buffer - add_2D_tensor_argument(idx, dst, slice); - - // K dimension (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(_k)); - - // LHS stride_z - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - - // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - - // Bias stride_z (if _add_bias == true) - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - - // dst stride_z - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - - // Cross-plan padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); - } - - // Dispatch kernel - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff 
--git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h deleted file mode 100644 index b8ae4b9ae3..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -#include "arm_compute/core/KernelDescriptors.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped - * - * @note The input matrices @p src0 and @p src1 must be reshaped through: - * - @ref ClGemmReshapeLhsMatrixKernel - * - @ref ClGemmReshapeRhsMatrixKernel - */ -class ClGemmMatrixMultiplyReshapedKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyReshapedKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel); - /** Initialise the kernel's input and output. - * - * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. - * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the - * multiplications. i.e. float c = (half)a * (half)b - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. - * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, - * the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement - * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * - * @param[in] compile_context The compile context to be used. 
- * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3 - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst dst tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) - * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const ClCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - unsigned int _k{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp deleted file mode 100644 index 32ee0f9705..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
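When rhs_info.export_to_cl_image is enabled, run_op above wraps the RHS buffer in a 2D image whose width is dimension(0) / 4 (presumably four elements per pixel, which also explains the documented CL_DEVICE_IMAGE2D_MAX_WIDTH * 4 limit), whose height is dimension(1) * dimension(2) and whose row pitch is the RHS Y stride in bytes. A minimal sketch of that mapping and of the documented size limits follows; the helper names are hypothetical, and the library performs the real validation in gemm::validate_image2d_support_on_rhs().

#include <cstddef>

// Hypothetical helpers; run_op does the equivalent inline before calling
// create_image2d_from_buffer().
struct RhsImage2d
{
    std::size_t width_px;        // dimension(0) / 4: four RHS elements per image pixel
    std::size_t height_px;       // dimension(1) * dimension(2): all batches stacked vertically
    std::size_t row_pitch_bytes; // Y stride of the RHS tensor; must meet the CL pitch alignment
};

RhsImage2d map_rhs_to_image2d(std::size_t dim0, std::size_t dim1, std::size_t dim2,
                              std::size_t stride_y_bytes)
{
    return { dim0 / 4, dim1 * dim2, stride_y_bytes };
}

// Size limits from the class documentation: src1 width <= CL_DEVICE_IMAGE2D_MAX_WIDTH * 4
// (i.e. width_px <= max image width) and height * depth <= CL_DEVICE_IMAGE2D_MAX_HEIGHT.
bool rhs_fits_image2d(const RhsImage2d &img, std::size_t max_image_width_px,
                      std::size_t max_image_height_px)
{
    return img.width_px <= max_image_width_px && img.height_px <= max_image_height_px;
}

Usage would feed the RHS tensor's first three dimensions, its strides_in_bytes()[1], and the device limits queried through clGetDeviceInfo; the remaining documented conditions (F32 data, n0/k0 in {4, 8, 16}, cl_khr_image2d_from_buffer support) are checked separately by the library.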
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - 
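The recurring ((x & (x - 1)) && x != 3) pattern in the validators above is how the "Only 2,3,4,8,16 are supported" rule is enforced: x & (x - 1) is zero exactly when x is a power of two, so the error fires for any value that is neither a power of two nor 3, while the neighbouring range checks bound the value to [2, 16]. A self-contained sketch of the combined predicate, with a hypothetical helper name:

#include <cassert>
#include <initializer_list>

// Combines the bit test and the range checks used by the kernels above.
bool is_supported_block_size(unsigned int v)
{
    const bool power_of_two = (v & (v - 1)) == 0; // true for 1, 2, 4, 8, 16, ...
    return (power_of_two || v == 3) && v >= 2 && v <= 16;
}

int main()
{
    for (unsigned int v : { 2u, 3u, 4u, 8u, 16u })       { assert(is_supported_block_size(v));  } // accepted
    for (unsigned int v : { 1u, 5u, 6u, 7u, 12u, 32u })   { assert(!is_supported_block_size(v)); } // rejected
    return 0;
}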
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- // This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src2_access); - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - _export_to_cl_image = rhs_info.export_to_cl_image; - _has_pad_y = gemm_info.has_pad_y; - - auto padding_info = get_padding_info({ src0, src1, dst }); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - // These variables are used only if gemm_info.has_pad_y == true - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); - build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + 
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - if(_has_pad_y) - { - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - } - - std::string kernel_name("gemm_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_has_pad_y ? "" : "no_pad_y_"); - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; - const size_t rhs_idx_batch_size = 2u; - const size_t bia_idx_batch_size = 2u; - const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u; - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - // Get cross plane pads - const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom; - const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom; - - // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor - ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); - - cl::Image2D src1_image2d; - - if(_export_to_cl_image) - { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); - const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - - // LHS buffer - add_2D_tensor_argument(idx, src0, slice); - - // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) - { - _kernel.setArg(idx++, src1_image2d); - } - else - { - add_2D_tensor_argument(idx, src1, slice_b); - } - - // Bias buffer (_add_bias == true) - add_2D_tensor_argument_if(_add_bias, idx, src2, slice); - - // dst buffer - add_2D_tensor_argument(idx, dst, slice); - - // LHS stride_z - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); - - // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); - - // Bias stride_z (if _add_bias == true) - if(_add_bias) - { - 
_kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); - } - - // dst stride_z - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[out_idx_batch_size])); - - // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); - } - - // Cross-plan padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); - } - - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h deleted file mode 100644 index 3d6164eca9..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -#include "arm_compute/core/KernelDescriptors.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped - * - * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel - */ -class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel -{ -public: - ClGemmMatrixMultiplyReshapedOnlyRhsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel); - /** Initialise the kernel's input and output. - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. - * Reading from the OpenCL image object can increase the performance. 
However, since the OpenCL image object is created importing the OpenCL buffer, - * the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement - * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). - * The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported: - * lhs_info.m0: 1,2,3,4,5,6,7,8 - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: - * rhs_info.k0: 2,3,4,8,16 - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.transpose: true,false - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ClCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp deleted file mode 100644 index f92945e2a4..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
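The stride-z bookkeeping in ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op above (lhs_idx_batch_size, out_idx_batch_size and the cross-plane pads) can be read as follows: when a tensor is reinterpreted as 3D and the dispatch runs without y padding (has_pad_y == false), its batch stride is taken from dimension index 3 instead of 2, and the cross-plane pad argument is only forwarded when the corresponding reinterpret flag and has_pad_y are both set; run_op also asserts the pads are zero whenever has_pad_y is false. A small sketch with hypothetical names, restating the inline logic:

#include <cstddef>

struct BatchStrideIndices
{
    std::size_t lhs; // index into strides_in_bytes() used as the LHS batch stride
    std::size_t out; // index into strides_in_bytes() used as the DST batch stride
};

BatchStrideIndices select_batch_stride_indices(bool reinterpret_input_as_3d,
                                               bool reinterpret_output_as_3d,
                                               bool has_pad_y)
{
    // Mirrors lhs_idx_batch_size / out_idx_batch_size above: 3 when the tensor is
    // reinterpreted as 3D and there is no y padding, otherwise the plain batch index 2.
    return { reinterpret_input_as_3d  && !has_pad_y ? std::size_t{3} : std::size_t{2},
             reinterpret_output_as_3d && !has_pad_y ? std::size_t{3} : std::size_t{2} };
}

// Cross-plane pad forwarded to the kernel only in the has_pad_y path.
constexpr unsigned int cross_plane_pad(unsigned int pad_top, unsigned int pad_bottom)
{
    return pad_top + pad_bottom;
}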
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; - const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; - bool window_changed = false; - - TensorInfo tmp_info(*src); - - if(reinterpret_input_as_3d) - { - // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape 
tmp_shape(src->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); - - // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic src_access(src, 0, 0, - src->dimension(0), - src->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - - window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop - update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - - auto padding_info = get_padding_info({ src }); - - _reinterpret_input_as_3d = reinterpret_input_as_3d; - - const unsigned int src_w = src->dimension(0); - const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); - const unsigned int partial_load_m0 = src_h % lhs_info.m0; - const unsigned int partial_load_k0 = src_w % lhs_info.k0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h)); - build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0)); - build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0)); - - std::string kernel_name("gemm_reshape_lhs_matrix_"); - kernel_name += lhs_info.transpose ? 
"t" : "nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "gemm_reshape_lhs_matrix_"; - _config_id += (_reinterpret_input_as_3d ? "3d_" : ""); - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.transpose); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first); - - return Status{}; -} - -void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window slice = window.first_slice_window_3D(); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the src has to be reinterpreted as 3D tensor - const unsigned int idx0 = 2 * num_arguments_per_3D_tensor(); - const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h deleted file mode 100644 index 73d811f3c3..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication. - * In particular, this function splits the src matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and - * stores each one in the dst matrix unrolling the values - */ -class ClGemmReshapeLhsMatrixKernel : public ICLKernel -{ -public: - ClGemmReshapeLhsMatrixKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Input tensor. Data types supported: All - * @param[out] dst Output tensor. Data type supported: same as @p src - * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary - * information to reshape the src tensor. 
Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.v0: greater than 0 - * lhs_info.transpose: true, false - * lhs_info.interleave: true, false - * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _reinterpret_input_as_3d{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp deleted file mode 100644 index 3a6f3c7e8f..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
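The supported block sizes documented above, and checked by validate_arguments() of the reshape kernels in this patch, reduce to a simple predicate: a power of two no larger than 16, or exactly 3. The sketch below is a hedged condensation of that rule; the helper name is illustrative, and the lower bound of 2 follows the documented set rather than the literal bitwise check in the source.

// Accepts {2, 3, 4, 8, 16}.
bool is_supported_block_size(unsigned int b)
{
    const bool pow2_or_3 = ((b & (b - 1)) == 0) || (b == 3);
    return (b >= 2) && (b <= 16) && pow2_or_3;
}
// is_supported_block_size(8)  -> true
// is_supported_block_size(6)  -> false (neither a power of two nor 3)
// is_supported_block_size(32) -> false (exceeds the 16 limit)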
- */ -#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(rhs_info.export_to_cl_image) - { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; - const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; - bool window_changed = false; - - // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); - - // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - window_changed = update_window_and_padding(win, src_access); - - if(rhs_info.export_to_cl_image) - { - gemm::update_padding_for_cl_image(dst); - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info)); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); - build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - - std::string kernel_name("gemm_reshape_rhs_matrix_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, rhs_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); - - return Status{}; -} - -void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window slice = window.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h deleted file mode 100644 index 27f80d3428..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
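As a worked illustration of the configure() step above: with hypothetical reshape parameters rhs_info.n0 = 4, k0 = 4, h0 = 8, transpose = true and interleave = true, the kernel selected would be "gemm_reshape_rhs_matrix_t" and its compile-time options would include the following. The parameter values are invented for the example; only the option names come from the code.

// -DN0=4 -DK0=4 -DH0=8 -DTRANSPOSE -DINTERLEAVE
// plus -DSRC_HEIGHT=<src->dimension(1)> and -DDATA_TYPE=<unsigned type matching the element size>,
// exactly as assembled by the CLBuildOptions calls above.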
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication - * In particular, this kernel splits the src matrix in blocks of size K0xN0 and stores each one in - * the dst matrix unrolling the values */ -class ClGemmReshapeRhsMatrixKernel : public ICLKernel -{ -public: - ClGemmReshapeRhsMatrixKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel); - /** Initialise the kernel's input and output. - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, - * required to create a OpenCL image object from buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel - * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32, F16 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * -# The output tensor should be only consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Input tensor. Data types supported: All - * @param[out] dst Output tensor. Data type supported: same as @p src - * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary - * information to reshape the src tensor. 
Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) - * rhs_info.h0: greater than 0 - * rhs_info.transpose: true, false - * rhs_info.interleave: true, false - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp deleted file mode 100644 index 9ff30eedcd..0000000000 --- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
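The export_to_cl_image requirements listed here (and in the matrix-multiply kernels that consume the reshaped RHS) can be read as a checklist. The sketch below is an illustrative condensation of the documented conditions, not the library's gemm::validate_image2d_support_on_rhs() implementation; all names are made up for the example.

// One flag per documented requirement for creating an image2d from the RHS buffer.
struct ClImageExportChecks
{
    bool n0_ok;           // rhs_info.n0 in {4, 8, 16}
    bool k0_ok;           // rhs_info.k0 in {4, 8, 16}
    bool dtype_ok;        // F32 (or F16 where the consuming kernel allows it)
    bool extension_ok;    // device exposes cl_khr_image2d_from_buffer
    bool pitch_ok;        // row pitch satisfies the device's image pitch alignment
    bool width_ok;        // width <= CL_DEVICE_IMAGE2D_MAX_WIDTH * 4
    bool height_depth_ok; // height * depth <= CL_DEVICE_IMAGE2D_MAX_HEIGHT
};

bool can_export_to_cl_image(const ClImageExportChecks &c)
{
    return c.n0_ok && c.k0_ok && c.dtype_ok && c.extension_ok && c.pitch_ok && c.width_ok && c.height_depth_ok;
}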
- */ -#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); - return Status{}; -} - -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _height_offset = height_offset; - - // Add build options - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); - // Configure kernel window - - // The window needs to be based on src as we copy all the heights of src - Window win = 
calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h deleted file mode 100644 index 0733078fc2..0000000000 --- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the height concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class ClHeightConcatenateKernel : public IClKernel -{ -public: - ClHeightConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] height_offset The starting offset on the Y axis for the dst tensor. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. 
- * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClHeightConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _height_offset; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp deleted file mode 100644 index 61ee443aa5..0000000000 --- a/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
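Putting the height concatenate kernel above in context: a caller concatenating several tensors along the Y axis configures one kernel per source, each with the running sum of the previous heights as height_offset. This is a hypothetical driver sketch, not the library's concatenate operator (which also manages kernel lifetime and execution); srcs, dst and compile_context are assumed to exist in the caller.

unsigned int height_offset = 0;
for(ITensorInfo *src : srcs)
{
    ClHeightConcatenateKernel kernel;
    kernel.configure(compile_context, src, height_offset, dst);
    height_offset += static_cast<unsigned int>(src->dimension(Window::DimY)); // next source starts right below this one
}
// validate_arguments() rejects any source for which
// src->dimension(DimY) + height_offset would exceed dst->dimension(DimY).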
- */ -#include "src/core/gpu/cl/kernels/ClIm2ColKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include -#include -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -struct Im2ColConfiguration -{ - std::string kernel_name{}; - std::set build_options{}; - unsigned int num_elems_processed_per_iteration{}; - bool is_padding_required_nchw{}; -}; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) -{ - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::NHWC && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(channel_idx) % num_groups) != 0); - - // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = src->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); - const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); - ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - - if(dst->total_size() > 0) - { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - TensorShape 
expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); - - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); - - const DataLayout data_layout = src->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - - // Configure the execute window based on the selected optimal OpenCL kernel - bool window_changed = false; - Window win; - - if(data_layout == DataLayout::NHWC) - { - win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - } - else - { - if(is_padding_required_nchw) - { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); - window_changed = window_changed || update_window_and_padding(win, input_access); - } - else - { - // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so - // update_window_and_padding() can be skipped - win = calculate_max_window(*src, Steps()); - } - } - - // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension - win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - const DataLayout data_layout = src->data_layout(); - const DataType data_type = src->data_type(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - const unsigned int input_channel = src->dimension(channel_idx); - - const std::pair convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); - - // Im2Col configuration - std::string kernel_name = "im2col_generic_"; - CLBuildOptions build_opts; - unsigned int num_elems_processed_per_iteration = 1; - bool is_padding_required_nchw = false; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width)); - build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height)); - build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first)); - build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right())); - build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel)); - build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); - build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); - build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - - if(data_layout == DataLayout::NHWC) - { - num_elems_processed_per_iteration = std::min(2U, input_channel); - is_padding_required_nchw = false; - - // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) - { - kernel_name = "im2col3x3_"; - } - else if(kernel_dims == Size2D(9U, 9U)) - { - kernel_name = "im2col9x9_"; - } - - // Get 
boundary vector (the first/last vector with potentially a partial vector size) size - // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size - // otherwise, the boundary vec size is the (partial) remainder vector size - const unsigned int vec_size = num_elems_processed_per_iteration; - const unsigned int partial_vec_size = input_channel % vec_size; - const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size)); - } - else - { - if(dilation == Size2D(1U, 1U)) - { - const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) - { - // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) - { - case 1: - // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) - { - kernel_name = "im2col1x1_stridex1_"; - num_elems_processed_per_iteration = 4; - is_padding_required_nchw = true; - } - break; - case 3: - kernel_name = "im2col3x3_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - break; - case 5: - kernel_name = "im2col5x5_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - break; - case 11: - // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) - { - kernel_name = "im2col11x11_padx0_pady0_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - } - break; - default: - kernel_name = "im2col_generic_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = false; - break; - } - } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) - { - kernel_name = "im2col_generic_padx0_pady0_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = false; - - // Optimized im2col is performed using one or more vector operations with the specified vector size - // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 - // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. - // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. - // Using the vector size of 8, however, may be faster. - // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 - // is used instead.) 
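Worked values for the two statements that follow, using the kernel widths mentioned in the comment above (illustration only):

// kernel_dims.width = 5 -> vector_size = 4, width_mod_vector_size = 1   (one vec4 + one scalar per kernel row)
// kernel_dims.width = 7 -> vector_size = 4, width_mod_vector_size = 3   (one vec4 + one vec3 per kernel row)
// kernel_dims.width = 2 -> vector_size = 2, width_mod_vector_size = 0   (a single vec2 per kernel row)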
- const size_t vector_size = std::min(static_cast(4), kernel_dims.width); - const size_t width_mod_vector_size = kernel_dims.width % vector_size; - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); - } - } - } - - // Append the data layout to the kernel_name - kernel_name += lower_string(string_from_data_layout(data_layout)); - - Im2ColConfiguration im2col_config; - im2col_config.kernel_name = kernel_name; - im2col_config.build_options = build_opts.options(); - im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration; - im2col_config.is_padding_required_nchw = is_padding_required_nchw; - - return im2col_config; -} -} // namespace - -ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - - auto padding_info = get_padding_info({ src, dst }); - _data_layout = src->data_layout(); - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - - // Select and configure the optimal OpenCL kernel to run. 
- // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration - // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - - // Create kernel - _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); - _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; - _kernel_dims = kernel_dims; // Only needed by the Tuner - _conv_info = conv_info; // Only needed by the Tuner - _num_groups = num_groups; - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw, num_groups); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = im2col_config.kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(num_groups); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw, num_groups) - .first); - return Status{}; -} - -void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - // Get initial windows - // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - window_collapsed.set_dimension_step(Window::DimZ, 1); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window window_output; - window_output.use_tensor_dimensions(dst->info()->tensor_shape()); - - const Window first_slice_3d = window_collapsed.first_slice_window_3D(); - - Window slice = first_slice_3d; - Window slice_in = first_slice_3d; - Window slice_out = window_output.first_slice_window_2D(); - - if(_data_layout == DataLayout::NHWC) 
- { - const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); - const int num_batches = tmp_win[3].end(); - - slice.set(1, Window::Dimension(0, static_cast(dst->info()->tensor_shape()[1]), 1)); - slice.set(2, Window::Dimension(0, static_cast(num_batches), 1)); - } - else - { - slice.set(0, Window::Dimension(0, static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); - slice.set(1, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); - // Note: In case of NCHW the 3rd dimension is already set collapsing the input window - } - - // Setup input slice - // The dimensions of the input are increased within the OpenCL kernel - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Setup output slice - // The dimensions of the output are increased within the OpenCL kernel - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); - _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) - { - add_2D_tensor_argument(idx, dst, slice_out); - } - else - { - add_3D_tensor_argument(idx, dst, slice_out); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClIm2ColKernel.h b/src/core/gpu/cl/kernels/ClIm2ColKernel.h deleted file mode 100644 index d1443f0434..0000000000 --- a/src/core/gpu/cl/kernels/ClIm2ColKernel.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
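To make the transform concrete, here is a minimal scalar reference of the rearrangement this kernel family performs, restricted to a single-channel NCHW image with stride 1, no padding and no dilation. It is an illustration only, independent of the OpenCL kernels and of the Compute Library API, and it reproduces the 4x4 / 3x3 example given in the class comment of the header that follows.

#include <vector>

// k x k blocks of a w x h image become the rows of an (out_w * out_h) x (k * k) matrix.
std::vector<float> im2col_reference(const std::vector<float> &src, int w, int h, int k)
{
    const int out_w = w - k + 1;
    const int out_h = h - k + 1;
    std::vector<float> dst(out_w * out_h * k * k);
    int row = 0;
    for(int y = 0; y < out_h; ++y)
    {
        for(int x = 0; x < out_w; ++x, ++row)
        {
            int col = 0;
            for(int ky = 0; ky < k; ++ky)
            {
                for(int kx = 0; kx < k; ++kx, ++col)
                {
                    dst[row * k * k + col] = src[(y + ky) * w + (x + kx)];
                }
            }
        }
    }
    return dst; // a 4x4 input with k = 3 yields the 4x9 matrix shown in the header's class comment
}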
- */ -#ifndef ARM_COMPUTE_CL_IM2COL_KERNEL_H -#define ARM_COMPUTE_CL_IM2COL_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the im2col reshape kernel. - * - * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. - * It is used to transform a convolution to a plain matrix multiplication. - * - * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * = - * \left( \begin{array}{ccccccccc} - * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ - * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ - * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ - * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - */ -class ClIm2ColKernel : public IClKernel -{ -public: - /** Default constructor */ - ClIm2ColKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIm2ColKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst The output tensor info. First 2 lower dimensions represent a transform of each 3D input, - * while every dimension above represents a batch. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClIm2ColKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - DataLayout _data_layout; - std::pair _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_IM2COL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp deleted file mode 100644 index 7c4dddc20e..0000000000 --- a/src/core/gpu/cl/kernels/ClMulKernel.cpp +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
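For context on the transform that the ClIm2ColKernel documentation above illustrates with the 4x4 / 3x3 example, here is a minimal standalone sketch in plain C++ (no ComputeLibrary types; the function name, row-major layout, unit stride and the absence of padding, dilation and bias are illustrative assumptions) of how a single-channel image is unrolled into patch rows:

#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative im2col for one channel, stride 1, no padding, no dilation.
// Each output row holds one kernel_h x kernel_w patch laid out left-to-right,
// top-to-bottom, matching the 4x4 / 3x3 example in the class documentation.
std::vector<std::vector<float>> im2col_sketch(const std::vector<float> &img,
                                              size_t height, size_t width,
                                              size_t kernel_h, size_t kernel_w)
{
    std::vector<std::vector<float>> rows;
    for(size_t y = 0; y + kernel_h <= height; ++y)
    {
        for(size_t x = 0; x + kernel_w <= width; ++x)
        {
            std::vector<float> patch;
            for(size_t ky = 0; ky < kernel_h; ++ky)
            {
                for(size_t kx = 0; kx < kernel_w; ++kx)
                {
                    patch.push_back(img[(y + ky) * width + (x + kx)]);
                }
            }
            rows.push_back(patch);
        }
    }
    return rows;
}

int main()
{
    // The 4x4 case from the class documentation, filled with 0..15.
    std::vector<float> img(16);
    for(size_t i = 0; i < img.size(); ++i)
    {
        img[i] = static_cast<float>(i);
    }
    const auto rows = im2col_sketch(img, 4, 4, 3, 3);
    std::printf("%zu rows of %zu elements\n", rows.size(), rows.front().size()); // 4 rows of 9
    return 0;
}

On the 4x4 image from the class comment this yields the four 9-element rows shown in the formula above; the actual kernel additionally handles NCHW/NHWC layouts, padding, dilation, grouping and, when has_bias is set, the extra column of ones.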
- */ -#include "src/core/gpu/cl/kernels/ClMulKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), - "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), - "Dst must be S32 if source tensors are S32"); - if(in_place) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), - "Wrong shape for dst"); - } - } - - return Status{}; -} -} // namespace - -ClMulKernel::ClMulKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string acc_type; - // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) - { - scale_int = -1; - acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; - } - else - { - if(src1->element_size() == 4 || src2->element_size() == 4) - { - // use 64 bit accumulator for 32-bit input - acc_type = "long"; - } - else if(src1->element_size() == 2 || src2->element_size() == 2) - { - // Use 32-bit accumulator for 16-bit input - acc_type = "int"; - } - else - { - // Use 16-bit accumulator for 8-bit input - acc_type = "ushort"; - } - } - - const bool is_quantized = is_data_type_quantized(src1->data_type()); - const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0)); - const unsigned int vec_size_leftover = dst->dimension(0) % vec_size; - - // Set kernel build options - std::string kernel_name = "pixelwise_mul"; - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? 
"1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()), - "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()), - "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()), - "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - kernel_name += "_quantized"; - } - else - { - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); - build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - } - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - build_opts.add_option_if(in_place, "-DIN_PLACE"); - build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set scale argument - unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - - if(scale_int >= 0 && !is_quantized) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - Window win = calculate_max_window(*dst, Steps(vec_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(dst->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); -} - -Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - - return Status{}; -} - -void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - // Check whether it is in_place calculation - const bool in_place = (src_0 == dst) || (src_1 == dst); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - if(!in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} - -namespace -{ -constexpr unsigned int vec_size_complex = 1; - -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -ClComplexMulKernel::ClComplexMulKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - - // Create kernel - _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); - - Window win = calculate_max_window(*dst, Steps(vec_size_complex)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - 
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); - - return Status{}; -} - -void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h deleted file mode 100644 index 2ee182b932..0000000000 --- a/src/core/gpu/cl/kernels/ClMulKernel.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H -#define ARM_COMPUTE_CL_MUL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pixelwise multiplication kernel. - * - * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. - * -*/ -class ClMulKernel : public IClKernel -{ -public: - ClMulKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel); - /** Initialise the kernel's src and dst. - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class ClComplexMulKernel : public ICLKernel -{ -public: - ClComplexMulKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel); - /** Initialise the kernel's src and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
- * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClComplexMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp b/src/core/gpu/cl/kernels/ClPermuteKernel.cpp deleted file mode 100644 index 722bf454f2..0000000000 --- a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
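The scale handling in ClMulKernel::configure() above selects the integer kernel variant only when the scale is exactly 1/2^n with 0 <= n <= 15, which matches the constraint documented for the configure() scale parameter. A small standalone reproduction of that frexp-based test (plain C++; the function name is illustrative and not part of the library):

#include <cmath>
#include <cstdio>

// Returns n >= 0 when scale == 1/2^n for 0 <= n <= 15, otherwise -1.
// Mirrors the test in ClMulKernel::configure(): frexp(1/2^n) yields a
// mantissa of exactly 0.5 and an exponent of 1 - n, so abs(exponent - 1) == n.
int scale_as_right_shift(float scale)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent);
    if(mantissa == 0.5f && exponent >= -14 && exponent <= 1)
    {
        return std::abs(exponent - 1);
    }
    return -1;
}

int main()
{
    std::printf("%d %d %d\n",
                scale_as_right_shift(1.0f),         // 0  -> "_int" kernel suffix
                scale_as_right_shift(0.0078125f),   // 7  (scale == 1/128)
                scale_as_right_shift(1.0f / 255));  // -1 -> "_float" kernel suffix
    return 0;
}

Note that 1/255, although an accepted scale value, does not pass the power-of-two test and therefore takes the float path, consistent with the "_int"/"_float" kernel-name selection in the configure() code above.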
- */ -#include "src/core/gpu/cl/kernels/ClPermuteKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm) -{ - TensorShape dst_shape = src->tensor_shape(); - permute(dst_shape, perm); - return dst_shape; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4, - "Permutation up to 4-D src tensor is supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, - "Permutation vector size should be less than or equal to 4"); - for(const auto &p : perm) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values"); - } - - // Validate configured dst - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - return Status{}; -} -} // namespace - -ClPermuteKernel::ClPermuteKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); - const TensorShape dst_shape = get_dst_shape(src, perm); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); - - _perm = perm; - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); - build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2))); - // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector - build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0)); - build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1)); - build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2)); - build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? 
perm[3] : 3)); - - _kernel = create_kernel(compile_context, "permute", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICLKernel::configure_internal(win); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); - - return Status{}; -} - -void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup dst slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice_in); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_in, lws_hint()); - } - while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.h b/src/core/gpu/cl/kernels/ClPermuteKernel.h deleted file mode 100644 index 839e224ee4..0000000000 --- a/src/core/gpu/cl/kernels/ClPermuteKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H -#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform tensor permutation. 
- * - * Permutes given a permutation vector - */ -class ClPermuteKernel : public IClKernel -{ -public: - ClPermuteKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel); - /** Set the src and dst of the kernel. - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClPermuteKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - PermutationVector _perm{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp deleted file mode 100644 index e522814b6d..0000000000 --- a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
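The ClPermuteKernel deleted above is driven by a PermutationVector of rank up to 4; below is a minimal sketch of how such a vector rearranges a shape, assuming the convention used by get_dst_shape() in the kernel source, namely that source dimension i moves to destination position perm[i]. The convention, the function name and the fixed rank of 4 are assumptions for illustration only, not library API:

#include <array>
#include <cstddef>
#include <cstdio>

// Illustrative shape permutation: destination position perm[i] receives
// source dimension i (assumed convention, see lead-in above).
std::array<size_t, 4> permute_shape(const std::array<size_t, 4> &src,
                                    const std::array<size_t, 4> &perm)
{
    std::array<size_t, 4> dst{};
    for(size_t i = 0; i < 4; ++i)
    {
        dst[perm[i]] = src[i];
    }
    return dst;
}

int main()
{
    // A [W, H, C, N] = [8, 4, 3, 2] shape permuted with (1, 2, 0, 3):
    // W -> position 1, H -> position 2, C -> position 0, N unchanged.
    const auto out = permute_shape({8, 4, 3, 2}, {1, 2, 0, 3});
    std::printf("%zu %zu %zu %zu\n", out[0], out[1], out[2], out[3]); // 3 8 4 2
    return 0;
}

With that convention the example prints 3 8 4 2, i.e. the channel dimension is moved ahead of width and height, the kind of layout reshuffle this kernel is used for.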
- */ -#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -// Internal window config info -using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size - -void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) -{ - TensorShape out_shape = compute_pool_shape(*src, pool_info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) - { - auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - // Check indices - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - - if(indices->total_size() != 0) - { - TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); - } - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Get data layout - const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int pool_stride_x = 0; - int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - BorderSize border_size = BorderSize(); - - auto_init(src, dst, indices, pool_info); - pooled_w = dst->tensor_shape()[idx_width]; - pooled_h = dst->tensor_shape()[idx_height]; - - const DataType data_type = src->data_type(); - - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - - unsigned int num_elems_processed_per_iteration = 0; - bool window_changed = false; - Window win{}; - switch(data_layout) - { - case DataLayout::NCHW: - { - // Initialize border size - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - // Change the number of elements processed per iteration - // for pooling 3x3 with stride less equal than 3 - const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); - num_elems_processed_per_iteration = can_optimize ? 
4 : 1; - const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; - - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, - pool_stride_x, pool_stride_y); - AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); - - // Update indices window - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - break; - } - case DataLayout::NHWC: - { - const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; - - // Initialize border size - border_size = BorderSize(); - num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); -} -} // namespace - -ClPool2dKernel::ClPool2dKernel() -{ - _type = CLKernelType::POOL; -} - -BorderSize ClPool2dKernel::border_size() const -{ - return _border_size; -} - -void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst, indices }); - - // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - - // Set build options - CLBuildOptions build_opts; - const DataType data_type = src->data_type(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, pool_info, indices); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - ICLKernel::configure_internal(std::get<1>(win_config)); - - ClPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); - - // Tensor paddings are used to calculate the indicies for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); - build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); - build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); - build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); - build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); - build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - } - - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Check dst dimensions - auto_init(src, dst, indices, pool_info); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); - build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); - build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); - build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); - build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); - - // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) - { - if(is_data_type_quantized(data_type)) - { - PixelValue type_min{}; - std::tie(type_min, std::ignore) = 
get_min_max(data_type); - build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); - } - else - { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); - } - } - else - { - // Pool AVG and Pool L2 initial value - build_opts.add_option("-DINITIAL_VALUE=0"); - } - - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); - - // Create kernel - switch(_data_layout) - { - case DataLayout::NCHW: - { - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; - const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); - build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - - if(pool_type != PoolingType::MAX) - { - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - } - - if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) - { - // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where - // each thread computes 4 dst elements - const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); - - std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") - + support::cpp11::to_string(pool_size_x); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - // For max pooling with pool2x2, store indicies which will be used in max unpooling - if(data_type == DataType::F32) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(data_type == DataType::F16) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - } - else // Run general case - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - case DataLayout::NHWC: - { - // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; - - // Wider accumulation is required to avoid accuracy loss - // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) - // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) - DataType acc_data_type = data_type; - - if(use_fp_mixed_precision) - { - acc_data_type = DataType::F32; - } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) - { - acc_data_type = DataType::S32; - } - - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); - build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); - build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) - { - build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); - - std::string kernel_name = "pooling_layer_2x2_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Set config_id for enabling LWS tuning - _config_id = "pooling_layer_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_width)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_height)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(src->data_layout())); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); - - return Status{}; -} - -void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window_collapsed.first_slice_window_3D(); - do - { - // Upsample src by pool size - Window in_slice(slice); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), - (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, - pool_stride_x * _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), - (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, - pool_stride_y)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_3D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - break; - } - case DataLayout::NHWC: - { - const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); - - Window slice = window_collapsed.first_slice_window_4D(); - Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); - in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - in_slice.set(3, Window::Dimension(0, batch_size, 1)); - do - { - // Set srcs - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, in_slice); - add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_4D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h deleted file mode 100644 index ab8c56a857..0000000000 --- a/src/core/gpu/cl/kernels/ClPool2dKernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H -#define ARM_COMPUTE_CL_POOL2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class ClPool2dKernel : public IClKernel -{ -public: - ClPool2dKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel); - - /** Configure kernel for a given list of arguments - * - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
- */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClPool2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - BorderSize _border_size{ 0 }; - unsigned int _num_elems_processed_per_iteration{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp deleted file mode 100644 index 7900489db7..0000000000 --- a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - - // Output must always be initialized - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - - return Status{}; -} -} // namespace - -ClQuantizeKernel::ClQuantizeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - const int vec_size_x = 16 / src->element_size(); - const int input_width_x = src->tensor_shape().x(); - const bool multi_access_x = (input_width_x / vec_size_x > 0); - - const UniformQuantizationInfo qinfo = dst->quantization_info().uniform(); - const DataType output_data_type = dst->data_type(); - - float scale_to_apply = qinfo.scale; - int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) - { - /* - * In case of requantization of a quantized input tensor to an output tensor with another quantization - * instead of of apply dequantization and then a quantization functions, we just compute new scale and - * offset to apply. 
- * - * Assuming: - * - q_i as input quantized value - * - q_o as output quantized value - * - z_i as input quantization offset value - * - z_o as output quantization offset value - * - s_i as input quantization scale value - * - s_o as output quantization scale value - * - z_n as new quantization offset value - * - s_n as new quantization scale value - * - * q_o = ( q_i - z_i ) * s_i / s_o + z_o - * - * We can rewrite the formula as: - * - * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o - * - * q_o = q_i / s_n + z_n - * - * Where: - * - * s_n = s_o / s_i - * - * z_n = - z_i * s_i / s_o + z_o - * - */ - const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform(); - scale_to_apply /= qinfo_in.scale; - // In order to minimize flooring we convert the offset to a float, - // then compute the new offset in the float domain, - // finally we convert it back as int32_t - offset_to_apply -= static_cast(static_cast(qinfo_in.offset) * qinfo_in.scale / qinfo.scale); - } - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT"); - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply)); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); - std::pair min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); - build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); - build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); - - _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.h 
b/src/core/gpu/cl/kernels/ClQuantizeKernel.h deleted file mode 100644 index 1991a2fba8..0000000000 --- a/src/core/gpu/cl/kernels/ClQuantizeKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H -#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the quantization layer kernel. - * - * @note The implementation supports only 3D input tensors. - */ -class ClQuantizeKernel : public IClKernel -{ -public: - ClQuantizeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel); - /** Set the input, output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. - * - * @note Output auto initialization is not supported by this kernel - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClQuantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClReshapeKernel.cpp deleted file mode 100644 index fcda061930..0000000000 --- a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClReshapeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include - -/** [ClReshapeKernel Kernel] **/ -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->tensor_shape().total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} -} // namespace - -ClReshapeKernel::ClReshapeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - std::set build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; - _kernel = create_kernel(compile_context, "reshape_layer", build_opts); - - // Add static arguments - const cl_int2 src_shape = - { - { - static_cast(src->tensor_shape()[0]), - static_cast(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast(dst->tensor_shape()[0]), - static_cast(dst->tensor_shape()[1]) - } - }; - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx++, src_shape); - _kernel.setArg(idx++, dst_shape); - - // Configure kernel window - Window win = calculate_max_window(*src); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClReshapeKernel::validate(const ITensorInfo *src, const 
ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - - return Status{}; -} - -void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, window_collapsed); - add_3D_tensor_argument(idx, dst, window_collapsed); - enqueue(queue, *this, slice, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -/** [ClReshapeKernel Kernel] **/ diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.h b/src/core/gpu/cl/kernels/ClReshapeKernel.h deleted file mode 100644 index 01e1ee84b9..0000000000 --- a/src/core/gpu/cl/kernels/ClReshapeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H -#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the kernel to perform tensor reshaping */ -class ClReshapeKernel : public IClKernel -{ -public: - ClReshapeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel); - /** Set the src and dst of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: All. - * @param[out] dst Destination tensor info. 
Data type supported: Same as @p src - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace opencl -} // namespace kernels -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp deleted file mode 100644 index ee4ee22aa0..0000000000 --- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClScaleKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Cast.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -inline std::pair calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) -{ - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const unsigned int src_width = src->dimension(idx_width); - const unsigned int src_height = src->dimension(idx_height); - const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int dst_height = dst->dimension(idx_height); - - float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners); - float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners); - - return std::make_pair(scale_x, scale_y); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); - - float scale_x = 0.f; - float scale_y = 0.f; - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); - - return Status{}; -} -} // namespace - -Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info)); - return Status{}; -} - -ClScaleKernel::ClScaleKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); - - // Info required for the static tuning - _data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; - - const bool is_nhwc = _data_layout == DataLayout::NHWC; - - float scale_x = 0.f; - float scale_y = 0.f; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) - { - interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; - } - - // Create kernel - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int src_width = src->dimension(idx_width); - const unsigned int src_height = src->dimension(idx_height); - const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width); - const unsigned int vec_size_leftover = (dst_width % vec_size); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); - build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); - build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); - - build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); - build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? 
support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); - build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); - build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - if(is_qasymm_bilinear) - { - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); - } - std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); - std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); - std::string kernel_name = "scale_" + interpolation_name + "_"; - kernel_name += lower_string(string_from_data_layout(_data_layout)); - - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(vec_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "scale_"; - _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : ""); - _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft"); - _config_id += (is_nhwc ? "nhwc" : "nchw"); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(3)); -} - -void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - break; - } - case DataLayout::NHWC: - { - Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_4D(); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - break; - } - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h deleted file mode 100644 index 6674931296..0000000000 --- a/src/core/gpu/cl/kernels/ClScaleKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H -#define ARM_COMPUTE_CL_SCALE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the scale kernel */ -class ClScaleKernel : public IClKernel -{ -public: - ClScaleKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel); - /** Initialise the kernel's inputs, output and interpolation policy - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[out] dst Destination tensor info. Data types supported: Same as @p src - * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp deleted file mode 100644 index 1dd905d66e..0000000000 --- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options. - * - * Prepares these build options: - * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier. - * -DIFF_MIN - threshold difference between maximum value of input data and current processed value, - * it defines whether the value will be taken into account or not. 
- * - * @param[in] build_opts Build options to extend - * @param[in] input_scale Input scaling factor - * @param[in] beta Exponent scaling factor beta - */ -CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta) -{ - // Number of integer bits in temporary fixed-point representation of current-to-max difference - static const int scaled_diff_int_bits = 5; - // Number of integer bits used in temporary fixed-point representation of exponent accumulator - static const int exp_accumulation_in_bits = 12; - - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); - int input_beta_multiplier; - int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); - - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); - - CLBuildOptions build_opts; - build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); - build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits)); - build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier)); - build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift)); - build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min)); - - return build_opts; -} - -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - - // Checks performed when output is configured - if(dst.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - } - - // Checks performed when sum is configured - if(sum.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum); - } - - return Status{}; -} - -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum); - ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); - - // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = 
is_data_type_quantized_asymmetric(info.input_data_type); - - // Checks performed when output is configured - if(dst.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info); - } - } - - return Status{}; -} -} // namespace - -/**< Grid size (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64; -/**< Vector size in the serial case (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8; -/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4; - -ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); - - // Output auto initialization if not yet initialized - auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); - auto_init_if_empty(dst, *src.clone()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); - - const DataType dt = src.data_type(); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); - const size_t reduction_dim_size = src.dimension(0); - const float beta = info.beta; - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; - - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size); - const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size)); - build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); - build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); - build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? 
std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); - - cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : ""); - - // Configure parallel kernel if needed - if(std::get<0>(parallel_reduction_info)) - { - kernel_name += "parallel"; - bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0); - build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size)); - - // Handle boundary conditions. - const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size; - build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE"); - // Setting _lws_hint in this way can also communicate grid_size to ClLogits1DMaxShiftExpSumKernel::run(). - // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0]. - lws_hint = cl::NDRange(_grid_size); - } - else - { - kernel_name += "serial"; - } - - // Create kernel. - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - Window win = calculate_max_window(src, Steps(reduction_dim_size)); - IClKernel::configure_internal(win, lws_hint); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); - return Status{}; -} - -ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size) -{ - bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1); - unsigned int vector_size = is_parallel_reduction ? 
_parallel_vector_size : _serial_vector_size; - return std::make_tuple(is_parallel_reduction, vector_size); -} - -void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto max = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_1)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum); - - // Collapse window in Z dimension - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - // Reconfigure window in case of parallel reduction - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) - { - // Launch grid_size parallel work items - window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); - } - - // Get slices - Window slice = window_collapsed.first_slice_window_3D(); - do - { - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, max, slice); - add_3D_tensor_argument(idx, dst, slice); - add_3D_tensor_argument(idx, sum, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} - -ClLogits1DNormKernel::ClLogits1DNormKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &dst, &sum }); - - // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); - - // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); - - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? 
CL_SCHAR_MIN : 0; - const unsigned int vector_size = adjust_vec_size(16, src.dimension(0)); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size)); - build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED"); - build_opts.add_options_if(is_quantized_asymmetric, - prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options()); - build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); - - // Create kernel - std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : ""); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - auto win = calculate_max_window(src, Steps(vector_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); - - return Status{}; -} - -void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - Window sum_slice = slice; - sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); - - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, sum, sum_slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h b/src/core/gpu/cl/kernels/ClSoftmaxKernel.h deleted file mode 100644 index a2ad02d6b7..0000000000 --- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CL_SOFTMAX_KERNEL_H - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for max, shifting, exponentiating and summing the logits */ -class ClLogits1DMaxShiftExpSumKernel : public IClKernel -{ - /**< Grid size (obtained through auto-tuning) */ - static const unsigned int _grid_size; - /**< Vector size in the serial case (obtained through auto-tuning) */ - static const unsigned int _serial_vector_size; - /**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ - static const unsigned int _parallel_vector_size; - -public: - /** Info for whether a parallel reduction will be run and the vector size of the execution. */ - using ParallelReductionInfo = std::tuple; - - ClLogits1DMaxShiftExpSumKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel); - /** Configure the kernel using the given information about tensors - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in,out] max Max values tensor. Data types supported: same as @p src - * @param[out] dst Destination tensor. Data types supported: same as @p src - * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); - /** Checks if the given size is eligible for parallel reduction - * - * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). 
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4. - * - * @param[in] size Size to check - * - * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run, - * while the second element is the vector size of the execution. - */ - static ParallelReductionInfo is_parallel_reduction(size_t size); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; - -/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ -class ClLogits1DNormKernel : public IClKernel -{ -public: - ClLogits1DNormKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel); - - /** Set the input and output tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported. - * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input - * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DNormKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp b/src/core/gpu/cl/kernels/ClTransposeKernel.cpp deleted file mode 100644 index 40bd4b034a..0000000000 --- a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClTransposeKernel::ClTransposeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output auto initialization if not yet initialized - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers)); - build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers)); - - _kernel = create_kernel(compile_context, "transpose", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); - - // Validate configured dst - if(dst->total_size() != 0) - { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -void 
ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.h b/src/core/gpu/cl/kernels/ClTransposeKernel.h deleted file mode 100644 index c8379d44c7..0000000000 --- a/src/core/gpu/cl/kernels/ClTransposeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to transpose a 2D tensor. */ -class ClTransposeKernel : public IClKernel -{ -public: - ClTransposeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel); - /** Set the src and dst of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. 
Data types supported: Same as @p src - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClTransposeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp deleted file mode 100644 index e3629f7706..0000000000 --- a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
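As a brief illustration of the ClTransposeKernel contract above (2-D inputs only, destination auto-initialised to the transposed shape), a validation sketch with made-up shapes might look like the following; the function and shapes are illustrative, not part of this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"

using namespace arm_compute;

void transpose_validate_example()
{
    // Illustrative 2-D shapes: a 64x24 F32 tensor transposes to 24x64.
    const TensorInfo  src(TensorShape(64U, 24U), 1, DataType::F32);
    const TensorShape transposed = misc::shape_calculator::compute_transposed_shape(src); // (24, 64)
    const TensorInfo  dst(transposed, 1, DataType::F32);

    // validate() rejects inputs with more than two dimensions or mismatching dst metadata.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClTransposeKernel::validate(&src, &dst));
}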
- */ -#include "src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - } - - return Status{}; -} -} // namespace - -ClWeightsReshapeKernel::ClWeightsReshapeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); - - const DataType data_type = src->data_type(); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type))); - build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options()); - - // Configure window - Window win = calculate_max_window(*src, Steps()); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const 
ITensorInfo *dst, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); - return Status{}; -} - -void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window out_window; - out_window.use_tensor_dimensions(dst->info()->tensor_shape()); - - Window in_slice = window.first_slice_window_3D(); - Window out_slice = out_window.first_slice_window_2D(); - - Window biases_window; - Window biases_slice; - - unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); - idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0; - _kernel.setArg(idx++, src->info()->dimension(0)); - _kernel.setArg(idx++, src->info()->dimension(1)); - _kernel.setArg(idx++, src->info()->dimension(2)); - _kernel.setArg(idx++, src->info()->dimension(3)); - _kernel.setArg(idx++, dst->info()->strides_in_bytes().z()); - - if(biases != nullptr) - { - biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); - biases_slice = biases_window.first_slice_window_1D(); - } - - do - { - // Set arguments - unsigned idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, biases_slice); - ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); - } - - // Run kernel - enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h deleted file mode 100644 index de2f2d10cc..0000000000 --- a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H -#define ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer - * - * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. - * In combination with the @ref opencl::kernels::ClIm2ColKernel can transform a convolution to a matrix multiplication. - * - * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: - * @f[ - * \left( \begin{array}{ccc} - * a000 & a001 & a002 \\ - * a010 & a011 & a012 \\ - * a020 & a021 & a022 \\ - * \end{array} \right) - * \left( \begin{array}{ccc} - * a100 & a101 & a102 \\ - * a110 & a111 & a112 \\ - * a120 & a121 & a122 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ - * \end{array} \right) - * @f] - */ -class ClWeightsReshapeKernel : public IClKernel -{ -public: - ClWeightsReshapeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWeightsReshapeKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All - * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] dst The output tensor info. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. - * Data types supported: Same as @p input - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. 
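The row linearisation described above, where each [kernel_x, kernel_y, IFM] volume of one output feature map is flattened into a single row, can be approximated with a plain host-side loop. This is only an illustration of the layout for num_groups == 1 and no bias, not the OpenCL implementation.

#include <cstddef>
#include <vector>

// Standalone approximation of the reshape performed by the kernel (num_groups == 1, no bias):
// weights are [kernel_x, kernel_y, IFM, OFM]; each OFM slice becomes one row of length kx*ky*IFM.
std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                std::size_t kx, std::size_t ky, std::size_t ifm, std::size_t ofm)
{
    std::vector<std::vector<float>> rows(ofm, std::vector<float>(kx * ky * ifm));
    for(std::size_t o = 0; o < ofm; ++o)
    {
        std::size_t col = 0;
        for(std::size_t c = 0; c < ifm; ++c)
        {
            for(std::size_t y = 0; y < ky; ++y)
            {
                for(std::size_t x = 0; x < kx; ++x)
                {
                    // Linear index of element (x, y, c, o) in a densely packed [kx, ky, IFM, OFM] buffer.
                    rows[o][col++] = w[((o * ifm + c) * ky + y) * kx + x];
                }
            }
        }
    }
    return rows;
}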
- */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWeightsReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp deleted file mode 100644 index 8607620e92..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); - return Status{}; -} - -ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); - build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); 
- const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x2_"; - _config_id += lower_string(string_from_data_type(src1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); -} - -void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_4D(); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h deleted file mode 100644 index 15e0757aec..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H -#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel of 2 tensors. - * The src1 and src2 tensors will be concatenated into the dst tensor. - */ -class ClWidthConcatenate2TensorsKernel : public IClKernel -{ -public: - ClWidthConcatenate2TensorsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel); - /** Initialise the kernel's sources and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: All. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 - * @param[out] dst Destination tensor info. Data types supported: Same as @p src1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenate2TensorsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp deleted file mode 100644 index edbc23c1d3..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); - return Status{}; -} - -void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); - 
build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); - build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0))); - build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If soources have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform(); - const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset)); - build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale)); - build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset)); - build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x4_"; - _config_id += lower_string(string_from_data_type(src1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src3->dimension(0)); - _config_id += "_"; - _config_id += 
support::cpp11::to_string(src3->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src4->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src4->dimension(1)); -} - -void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_4D(); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, src2, slice); - add_4D_tensor_argument(idx, src3, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h deleted file mode 100644 index 1e3f47f7fb..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H -#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel of 4 tensors. - * All source tensors will be concatenated into the destination tensor. 
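For context, run_op() above expects the four sources at consecutive ACL_SRC_VEC slots of the tensor pack. A hypothetical helper that assembles such a pack could look as follows; the helper name and tensor names are illustrative, while ITensorPack and TensorType are the library types already used by the code above.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"

using namespace arm_compute;

// Illustrative helper: binds four sources and one destination the way run_op() retrieves them.
ITensorPack make_width_concat4_pack(const ITensor *t0, const ITensor *t1, const ITensor *t2, const ITensor *t3, ITensor *out)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_VEC, t0);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, t1);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 2, t2);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 3, t3);
    pack.add_tensor(TensorType::ACL_DST, out);
    return pack;
}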
- */ -class ClWidthConcatenate4TensorsKernel : public IClKernel -{ -public: - ClWidthConcatenate4TensorsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel); - /** Initialise the kernel's sources and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: All. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 - * @param[in] src3 Third source tensor info. Data types supported: same as @p src1 - * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 - * @param[out] dst Destination tensor info. Data types supported: same as @p src1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp deleted file mode 100644 index 5510c746f8..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClWidthConcatenateKernel::ClWidthConcatenateKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); - return Status{}; -} - -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); - - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const 
Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h deleted file mode 100644 index 300c4beb30..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class ClWidthConcatenateKernel : public IClKernel -{ -public: - ClWidthConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] width_offset The offset on the X axis. - * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. 
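The width_offset parameter is the X position at which this source is written into dst; validate() above requires src->dimension(0) + width_offset to fit within dst->dimension(0). A sketch of how a caller might configure one kernel per source with an accumulating offset, with all helper names being illustrative:

#include <memory>
#include <vector>

#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"

using namespace arm_compute;

// Illustrative driver: one ClWidthConcatenateKernel per source, each writing at the running X offset.
std::vector<std::unique_ptr<opencl::kernels::ClWidthConcatenateKernel>>
configure_width_concat(const CLCompileContext &ctx, const std::vector<ITensorInfo *> &srcs, ITensorInfo *dst)
{
    std::vector<std::unique_ptr<opencl::kernels::ClWidthConcatenateKernel>> kernels;
    unsigned int width_offset = 0;
    for(ITensorInfo *src : srcs)
    {
        ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWidthConcatenateKernel::validate(src, width_offset, dst));
        auto k = std::make_unique<opencl::kernels::ClWidthConcatenateKernel>();
        k->configure(ctx, src, width_offset, dst);
        kernels.push_back(std::move(k));
        width_offset += src->dimension(0); // the next source starts where this one ends
    }
    return kernels;
}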
- * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp deleted file mode 100644 index ae43fed12d..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_UNUSED(output); - - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; - const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 
1 : input->dimension(2); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); - Window win_collapsed = win.collapse(win, Window::DimZ); - return std::make_pair(Status{}, win_collapsed); -} -} // namespace - -ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL"); - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - - // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); - - return Status{}; -} - -void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Setup output window - Window window_out; - window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_3D_tensor_argument(idx, dst, window_out); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h deleted file mode 100644 index 145954fbb1..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the Winograd filter transform kernel. */ -class ClWinogradFilterTransformKernel : public IClKernel -{ -public: - ClWinogradFilterTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel); - /** Set the input and output tensor. - * - * @note Winograd filter transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd filter transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32. - * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. 
Data types supported: Same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradFilterTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp deleted file mode 100644 index 538d8ae602..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
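For reference, a minimal sketch of how the filter-transform kernel deleted above is exercised through its public interface. It is grounded only in the configure()/validate() signatures and the shape/layout notes visible in this hunk; the tensor shape, the F(4x4, 3x3) configuration, the WinogradInfo argument order and the pre-move include path are assumptions.

    // Illustrative only: validates a Winograd 3x3 filter transform for NHWC weights.
    // Shape, tile/kernel sizes and the WinogradInfo argument order are assumptions.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" // pre-move path

    arm_compute::Status check_filter_transform()
    {
        using namespace arm_compute;
        // NHWC weights [IFM, kernel_x, kernel_y, OFM], F32
        TensorInfo weights(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
        weights.set_data_layout(DataLayout::NHWC);
        TensorInfo transformed{}; // left empty; configure() would auto-initialise it

        // Assumed field order: output tile, kernel size, input dims, conv info, layout
        const WinogradInfo winograd_info(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U),
                                         PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC);

        return opencl::kernels::ClWinogradFilterTransformKernel::validate(&weights, &transformed, winograd_info);
    }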
- */ -#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); - - ARM_COMPUTE_UNUSED(conv_info); - ARM_COMPUTE_UNUSED(output_tile_size); - ARM_COMPUTE_UNUSED(kernel_size); - - // Validate configured output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - bool window_changed = false; - Window win = calculate_max_window(*input, Steps(1, 1)); - - if(input->data_layout() == DataLayout::NCHW) - { - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - - unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; - unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); - window_changed = update_window_and_padding(win, input_access); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClWinogradInputTransformKernel::ClWinogradInputTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -BorderSize ClWinogradInputTransformKernel::border_size() const -{ - return _border_size; -} - -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - - auto padding_info = get_padding_info({ src, dst }); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - - _data_layout = src->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); - - _num_tiles_x = num_tiles.width; - _num_tiles_y = num_tiles.height; - - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); - - ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast(dst->dimension(1))); - const size_t total_batches = src->tensor_shape().total_size_upper(3); - - CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) - { - build_opts.add_option("-DNHWC"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_w))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_h))); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); - build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); - } - else - { - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + 
get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); - } - - // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); - - // Get the maximum dimension from the tile size - const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); - - // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) - { - _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; - } - - // Append stepz and data layout - kernel_name += "_stepz"; - kernel_name += support::cpp11::to_string(_step_z); - kernel_name += "_" + lower_string(string_from_data_layout(_data_layout)); - - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Create window and update padding - auto win_config = validate_and_configure_window(src, dst, winograd_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); - - _border_size = BorderSize(src->padding()); - - ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info)); - - _config_id = kernel_name; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_info.pad_left()); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_info.pad_top()); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); - return Status{}; -} - -void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const size_t total_batches = window.shape().total_size_upper(3); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - if(_data_layout == DataLayout::NHWC) - { - Window slice = window_collapsed.first_slice_window_3D(); - slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); - slice.set(2, Window::Dimension(0, 
total_batches, 1)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window slice = window_collapsed.first_slice_window_3D(); - slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1)); - slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1)); - - ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0); - slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z)); - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); - _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h deleted file mode 100644 index 40fc2f387a..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform Winograd input transform.*/ -class ClWinogradInputTransformKernel : public IClKernel -{ -public: - ClWinogradInputTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel); - /** Set the input and output of the kernel. 
- * - * @note Winograd input transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd input transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to transform. Data types supported: F16/F32 - * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradInputTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - using WinogradKey = std::pair, std::pair>; - - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp deleted file mode 100644 index f6ade57e5d..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
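The configure() shown above assembles the OpenCL program name from the output tile, the kernel size, the Z step and the data layout. The helper below mirrors that logic as a standalone sketch; the function name and the example values are illustrative and not part of the patch.

    // Mirrors the kernel-name assembly in the removed configure(); sketch only.
    #include <algorithm>
    #include <string>
    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    std::string winograd_input_transform_kernel_name(const arm_compute::Size2D &output_tile,
                                                     const arm_compute::Size2D &kernel,
                                                     arm_compute::DataLayout layout,
                                                     size_t src_depth)
    {
        using namespace arm_compute;
        // Step along Z defaults to 1; the optimized 2x2 NCHW path uses 2 when the depth is even
        unsigned int step_z = 1;
        if(std::max(output_tile.width, output_tile.height) == 2 && layout == DataLayout::NCHW)
        {
            step_z = (src_depth % 2 != 0) ? 1 : 2;
        }
        return "winograd_input_transform_" + output_tile.to_string() + "_" + kernel.to_string()
               + "_stepz" + std::to_string(step_z) + "_" + lower_string(string_from_data_layout(layout));
    }
    // e.g. Size2D(2U, 2U), Size2D(3U, 3U), DataLayout::NCHW, 64U -> "winograd_input_transform_2x2_3x3_stepz2_nchw"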
- */ -#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); - - // Compute number of elements to process in the X and Y direction - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles.area()))); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_UNUSED(bias); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; - - if(output->data_layout() == DataLayout::NCHW) - { - const int 
output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); - const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); - AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); - window_changed = update_window_and_padding(win, input_access, output_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - auto padding_info = get_padding_info({ src, bias, dst }); - - _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; - - // Compute num_tiles_x - const Size2D input_dimensions = winograd_info.input_dimensions; - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); - const size_t total_batches = dst->tensor_shape().total_size_upper(3); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) - { - build_opts.add_option("-DVEC_SIZE=2"); - } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) - { - build_opts.add_option("-DVEC_SIZE=4"); - } - - build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); - build_opts.add_option("-cl-fast-relaxed-math"); - build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); - 
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); - - // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); -} - -Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? 
bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); - return Status{}; -} - -void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - // Get initial windows - Window slice = window_collapsed.first_slice_window_4D(); - slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - // Setup output slice - Window slice_out(slice); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - if(bias != nullptr) - { - unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); - Window slice_biases; - slice_biases.use_tensor_dimensions(bias->info()->tensor_shape()); - add_1D_tensor_argument(idx1, bias, slice_biases); - } - - if(_is_nhwc) - { - unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); - _kernel.setArg(idx2, static_cast(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); - } - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h deleted file mode 100644 index 22b7f079c1..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the Winograd output transform kernel. */ -class ClWinogradOutputTransformKernel : public IClKernel -{ -public: - ClWinogradOutputTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel); - /** Set the input and output tensor. - * - * @note Winograd output transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd output transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32. - * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p src - * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradOutputTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - using WinogradKey = std::pair, std::pair>; - - bool _is_nhwc{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp deleted file mode 100644 index 7866ccb679..0000000000 --- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) -{ - ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); - v0 = std::max(std::min(static_cast(m / m0), static_cast(v0)), static_cast(1)); - h0 = std::max(std::min(static_cast(n / n0), static_cast(h0)), static_cast(1)); - - const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave); - const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image); - - return std::make_pair(lhs_info, rhs_info); -} - -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); - const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); - const TensorInfo tensor_reshaped_info(shape, 1, data_type); - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) - { - return info_img; - } - else - { - return info_buf; - } -} - -void update_padding_for_cl_image(ITensorInfo *tensor) -{ - constexpr unsigned int num_floats_per_pixel = 4; - - const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); - const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); - - ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) - { - return; - } - - const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; - - tensor->extend_padding(PaddingSize(0, tensor->padding().right + 
padding, 0, 0)); -} - -Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) -{ - if(rhs_info.export_to_cl_image) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - - // Check the width and height of the output tensor. - // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); - } - - return Status{}; -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h deleted file mode 100644 index 3fce8c9173..0000000000 --- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H -#define ARM_COMPUTE_CL_GEMM_HELPERS_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - * - * @param[in] m Number of rows (M) in the LHS matrix not reshaped - * @param[in] n Number of columns (N) in the RHS matrix not reshaped - * @param[in] m0 Number of rows processed by each thread/work-item - * @param[in] n0 Number of columns processed by each thread/work-item - * @param[in] k0 Number of inner accumulation performed by each thread/work-item - * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row - * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row - * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row - * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row - * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before been stored - * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before been stored - * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image - * - * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - */ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); - -/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - * - * This function accepts two pairs of GEMMLHSMatrixInfo/GEMMRHSMatrixInfo where only the first is with cl_image2d support, - * and selects the valid one validating the GEMMRHSMatrixInfo. If the validation passes, the functions will return - * the first GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair with cl_image2d support. 
- * - * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support - * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used - * @param[in] n Number of columns (N) in the RHS matrix not reshaped - * @param[in] k Number of rows (K) in the RHS matrix not reshaped - * @param[in] b Batch size - * @param[in] data_type Data type - * - * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - */ -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); - -/** Update padding required to export the OpenCL buffer to OpenCL image2d - * - * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d - */ -void update_padding_for_cl_image(ITensorInfo *tensor); - -/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix - * - * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix - * @param[in] rhs_info @ref GEMMRHSMatrixInfo - * - * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix - */ -Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info); -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */ diff --git a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h deleted file mode 100644 index a49836cfda..0000000000 --- a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
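The GEMM helper hunks above lost the text inside angle brackets when the patch was flattened; the two central helpers return std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>, reconstructed here from the surrounding code. The block sizes in the sketch are illustrative rather than tuned values, and an initialised CL context is assumed because select_lhs_rhs_info() queries the device.

    // Sketch of the configure/select pattern from the removed ClGemmHelpers.h.
    #include <utility>
    #include "arm_compute/core/Types.h"
    #include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" // pre-move path

    using GemmConfigPair = std::pair<arm_compute::GEMMLHSMatrixInfo, arm_compute::GEMMRHSMatrixInfo>;

    GemmConfigPair pick_gemm_configs(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        using namespace arm_compute;
        using namespace arm_compute::opencl::kernels::gemm;

        // Candidate that exports the reshaped RHS to a cl_image (last argument true)...
        const GemmConfigPair info_img = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, false, true, true);
        // ...and a plain buffer fall-back with the same block sizes
        const GemmConfigPair info_buf = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, false, true, false);

        // Keeps the image variant only if validate_image2d_support_on_rhs() accepts it on this device
        return select_lhs_rhs_info(info_img, info_buf, n, k, b, DataType::F32);
    }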
- */ -#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H -#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H - -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" - -#include -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Basic container for the OpenCL GEMM configuration functions */ -template -class CLGEMMConfigArray -{ -public: - /** Alias for F32 index */ - static constexpr size_t DT_F32 = 0; - /** Alias for F16 index */ - static constexpr size_t DT_F16 = 1; - /** Alias for Int8 index */ - static constexpr size_t DT_INT8 = 2; - - /** Constructor - * - * @param[in] func_f32 Function to call for GEMM F32 - * @param[in] func_f16 Function to call for GEMM F16 - * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) - * - */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } - { - } - - /** Method to return the GEMM configuration function based on data type - * - * @param[in] data_type Input data type - * - * @return the valid function otherwise it returns nullptr if the data type is not valid - */ - T get_function(DataType data_type) - { - switch(data_type) - { - case DataType::F32: - return _configs.at(DT_F32); - case DataType::F16: - return _configs.at(DT_F16); - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8_PER_CHANNEL: - return _configs.at(DT_INT8); - default: - return nullptr; - } - } - -private: - std::array _configs; -}; - -/** Basic interface for the GEMM kernel configuration */ -class IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] arch GPU target - */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) - { - } - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); - /** Virtual destructor */ - virtual ~IClGemmKernelConfig() = default; - /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used - * - * @param[in] m Number of rows LHS matrix - * @param[in] n Number of columns RHS matrix - * @param[in] k Number of columns LHS matrix or number of rows RHS matrix - * @param[in] b Batch size - * @param[in] data_type Data type - */ - virtual std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; - -protected: - GPUTarget _target; -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp deleted file mode 100644 index 9d11006703..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
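The IClGemmKernelConfig.h hunk above also lost its template arguments: CLGEMMConfigArray is a class template holding a std::array<T, 3> of per-data-type entries, and configure() returns std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>. The class below is a hypothetical stand-in that sketches the same dispatch-by-data-type pattern; the GEMMLHSMatrixInfo/GEMMRHSMatrixInfo constructor argument orders are taken from the removed ClGemmHelpers.cpp, and the concrete block sizes are not tuned values.

    // Hypothetical stand-in illustrating the member-function dispatch used by the
    // Bifrost/Midgard config classes; not part of the library.
    #include <array>
    #include <utility>
    #include "arm_compute/core/Types.h"

    class DemoGemmConfig
    {
    public:
        using GemmConfigPair = std::pair<arm_compute::GEMMLHSMatrixInfo, arm_compute::GEMMRHSMatrixInfo>;

        GemmConfigPair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, arm_compute::DataType data_type)
        {
            using Fn = GemmConfigPair (DemoGemmConfig::*)(unsigned int, unsigned int, unsigned int, unsigned int);
            // One entry each for F32, F16 and the quantized Int8 types, as in CLGEMMConfigArray<Fn>
            const std::array<Fn, 3> table{ &DemoGemmConfig::cfg_f32, &DemoGemmConfig::cfg_f32, &DemoGemmConfig::cfg_i8 };
            const size_t idx = (data_type == arm_compute::DataType::F32) ? 0 :
                               (data_type == arm_compute::DataType::F16) ? 1 : 2;
            return (this->*table[idx])(m, n, k, b);
        }

    private:
        GemmConfigPair cfg_f32(unsigned int, unsigned int, unsigned int, unsigned int)
        {
            // GEMMLHSMatrixInfo(m0, k0, v0, transpose, interleave); GEMMRHSMatrixInfo(n0, k0, h0, transpose, interleave, export_to_cl_image)
            return { arm_compute::GEMMLHSMatrixInfo(4, 4, 1, false, false), arm_compute::GEMMRHSMatrixInfo(4, 4, 1, false, false, false) };
        }
        GemmConfigPair cfg_i8(unsigned int, unsigned int, unsigned int, unsigned int)
        {
            return { arm_compute::GEMMLHSMatrixInfo(2, 16, 1, false, false), arm_compute::GEMMRHSMatrixInfo(2, 16, 1, false, false, false) };
        }
    };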
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G71: - func = configs_G71.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, 
false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } - } - else - { - if(m == 1) - { - if(n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); - } - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n > 4196) - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); - } - else - { - if(k < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); - } - else if(k >= 2048 && k < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false); - } - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h 
b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h deleted file mode 100644 index 385b96e40e..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Bifrost based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp deleted file mode 100644 index e3c129e3be..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
-                                                                                              unsigned int b);
-
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
-                                                                        nullptr,
-                                                                        &ClGemmDefaultConfigNativeMidgard::default_q8);
-
-    auto func = configs_default.get_function(data_type);
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
-    return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
-    const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
-
-    return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
deleted file mode 100644
index 0ff5471f7c..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Midgard based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp deleted file mode 100644 index 92767aca52..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); - - auto func = configs_default.get_function(data_type); - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } - } - else - { - if(m == 
1) - { - if(n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); - } - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h deleted file mode 100644 index 17e4c9d339..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Valhall based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeValhall(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h deleted file mode 100644 index ff6a0128af..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMNative factory class */
-class ClGemmNativeKernelConfigurationFactory final
-{
-public:
-    /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMNative kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-                return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
deleted file mode 100644
index b030913a87..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G52: - func = configs_G52.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - 
if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true); - } - } - else - { - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true); - } - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(workload <= 274.4000f) - { - if(r_nk <= 0.7461f) - { - if(r_mn <= 21.1667f) - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_mk <= 17.3926f) - { - if(workload <= 542.4000f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_nk <= 0.5463f) - { - if(workload <= 11767.6001f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); - - return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 323.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true); - } - - // Get lhs_info/rhs_info in case of OpenCL image - // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) - { - // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); - } - else - { - // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); - } - - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); - const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); - const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); - - // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d - const bool use_cl_image2d = (n <= 4) ? 
false : true; - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) - { - return std::make_pair(lhs_info_img, rhs_info_img); - } - else - { - return std::make_pair(lhs_info_buf, rhs_info_buf); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - - if(workload <= 1595.2000f) - { - if(r_mk <= 2.1044f) - { - if(workload <= 870.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true); - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h deleted file mode 100644 index 52e6ce3f48..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Bifrost based OpenCL GEMMReshaped configuration */ -class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */ diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp deleted file mode 100644 index 57e42c92b3..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G78: - func = configs_G78.get_function(data_type); - break; - case GPUTarget::G77: - default: - func = configs_G77.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1); - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - - if(r_mk <= 0.11824845522642136) - { - if(workload <= 880.0) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - else - { - if(r_nk <= 0.42521367967128754) - { - if(workload <= 1726.4000244140625) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - if(workload <= 1241.6000366210938) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - } - } - } - } - else - { - if(workload <= 11404.7998046875) - { - if(r_mk <= 1.0126488208770752) - { - if(r_mn <= 2.545312523841858) - { - 
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - } - else - { - if(workload <= 2881.199951171875) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - else - { - if(r_nk <= 0.5765306055545807) - { - if(r_mn <= 6.010416746139526) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 1288.0000f) - { - if(workload <= 505.6000f) - { - if(r_mn <= 0.4466f) - { - if(r_nk <= 0.2384f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); - } - } - else - { - if(r_mn <= 0.2250f) - { - if(r_mn <= 0.1599f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - if(r_mk <= 0.7609f) - { - if(r_mn <= 2.5453f) - { - if(workload <= 1089.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - } - } - } - else - { - if(workload <= 5434.4001f) - { - if(workload <= 1603.2000f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 0.6192f) - { - if(r_mn <= 16.1016f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(workload <= 2750.0000f) - { - return 
configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_mk <= 6.3151f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - else - { - if(r_mk <= 0.0387f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - if(r_mk <= 2.5859f) - { - if(r_mk <= 0.2734f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - } - else - { - if(r_mk <= 25.7500f) - { - if(r_mk <= 0.3615f) - { - if(r_mn <= 0.0913f) - { - if(r_mk <= 0.0683f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 11174.3999f) - { - if(r_mk <= 0.8047f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(workload <= 7185.5999f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - } - } - else - { - if(workload <= 17917.5000f) - { - if(r_mk <= 1.5078f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 34449.6016f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); - } - } - } - } - } - else - { - if(r_mk <= 331.1111f) - { - if(workload <= 53397.5996f) - { - if(r_mn <= 57.8063f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(r_nk <= 0.9211f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - } - } - else - { - if(workload <= 38070.4004f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 801.6000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - if(r_mn <= 0.1211f) - { - if(workload <= 3296.0000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 1.0625f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); - } - } - } - else - { - if(workload <= 5068.8000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 0.2361f) - { - if(workload <= 12630.0000f) - { - return 
configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 178790.3984f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1); - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h deleted file mode 100644 index 588cd64e0e..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Valhall based OpenCL GEMMReshaped configuration */ -class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h deleted file mode 100644 index c990c89a91..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshaped factory class */
-class ClGemmReshapedKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshaped kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshaped kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
deleted file mode 100644
index 417d540468..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G51: - func = configs_G51.get_function(data_type); - break; - case GPUTarget::G52: - func = configs_G52.get_function(data_type); - break; - case GPUTarget::G31: - func = configs_G31.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n <= 2548) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - 
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - const bool is_workload_big = ((m * n * b) / 16) >= 2048; - - if(m == 1) - { - if(n >= 8192) - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); - } - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); - } - } - - // Get lhs_info/rhs_info in case of OpenCL image - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); - } - - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); - const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); - const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); - - // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d - const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
false : true; - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) - { - return std::make_pair(lhs_info_img, rhs_info_img); - } - else - { - return std::make_pair(lhs_info_buf, rhs_info_buf); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(m == 1) - { - if(r_nk <= 0.4664f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(workload <= 274.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int n0 = n < 1280 ? 
2 : 4; - const unsigned int h0 = std::max(n / n0, 1U); - return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n > 2048) - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(m == 1) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - - if(r_mk <= 0.0026f) - { - if(r_nk <= 0.4664f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - if(r_mk <= 0.0148f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - - if(workload <= 362.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); - } - else - { - if(r_mn <= 22.6067f) - { - if(workload <= 708.8000f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false); - } - } - else - { - if(r_nk <= 0.0917f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - } -} - -std::pair 
ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - - if(m == 1) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 7449.60f) - { - if(workload <= 691.60f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); - } - else - { - if(workload <= 4155.20f) - { - return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false); - } - } - } - else - { - if(workload <= 16300.80f) - { - if(r_mn <= 44.56f) - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - } - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int n0 = n < 1280 ? 
2 : 4; - const unsigned int h0 = std::max(n / n0, 1U); - return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true); - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 2), static_cast(128)), static_cast(1)); - if(m == 1) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); - } -} - -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h deleted file mode 100644 index 98c8e53569..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
deleted file mode 100644
index 4c6e633896..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G78: - func = configs_G78.get_function(data_type); - break; - case GPUTarget::G77: - default: - func = configs_G77.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - if(m == 1) - { - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - - if(r_mk <= 0.0064484127797186375) - { - if(r_mn <= 0.0028273810748942196) - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - const unsigned int h0 = std::max(n / 4, 1U); - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0); - } - } - else - { - if(r_mk <= 0.020312500186264515) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 
0); - } - } - } - else - { - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - - if(workload <= 1999.2000122070312) - { - if(workload <= 747.1999816894531) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_mn <= 0.03348214365541935) - { - if(r_mk <= 0.028125000186264515) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 836.0) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0); - } - } - else if(m < 128) - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(k >= 512) - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(n >= 64) - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0); - } - else - { - if(k >= 512) - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - const int h0 = 
std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(m == 1) - { - if(workload <= 278.7000f) - { - if(workload <= 7.5000f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - if(r_mn <= 0.0031f) - { - if(workload <= 256.6000f) - { - if(workload <= 16.7500f) - { - if(r_nk <= 1.6671f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - else - { - if(r_mk <= 0.0027f) - { - if(r_mk <= 0.0014f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - if(workload <= 8.9500f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - } - else - { - if(workload <= 14.1500f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 0.0041f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - } - } - } - } - } - else - { - if(workload <= 363.7000f) - { - if(r_mk <= 0.0031f) - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); - } - } - } - else - { - if(workload <= 1384.8000f) - { - if(workload <= 704.0000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(workload <= 16761.6006f) - { - if(r_mn <= 187.1250f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(r_mk <= 432.4630f) - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1); - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(m == 1) - { - if(r_mn <= 0.0038f) - { - if(workload <= 353.9000f) - { - if(workload <= 278.7000f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(r_mk <= 0.0004f) - { 
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(r_mk <= 0.0030f) - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - } - } - } - else - { - if(r_nk <= 1.9384f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); - } - } - } - else - { - if(r_nk <= 1.0368f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - } - } - else - { - if(workload <= 1422.4000f) - { - if(workload <= 704.0000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(workload <= 1197.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - else - { - if(workload <= 1241.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - } - } - } - else - { - if(workload <= 2769.6000f) - { - if(workload <= 1846.4000f) - { - if(r_mn <= 2.4927f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - } - else - { - if(r_mn <= 0.6261f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 3.4453f) - { - if(r_mn <= 1.4135f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - } - } - } - else - { - if(r_nk <= 0.0302f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - else - { - if(r_mk <= 181.3750f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - if(workload <= 28035.2002f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 808.6667f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - } - } - } - } - } - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h deleted file mode 100644 index 6a11ddb748..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
deleted file mode 100644
index 8fd71276a0..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshapedOnlyRHS factory class */
-class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshapedOnlyRHS kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
index e682973827..b1aee64d5d 100644
--- a/src/core/utils/AssemblyUtils.h
+++ b/src/core/utils/AssemblyUtils.h
@@ -26,7 +26,7 @@
 #include "arm_compute/core/Types.h"
 #include "src/core/NEON/kernels/assembly/common.hpp"
-#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
 namespace arm_compute
 {
-- 
cgit v1.2.1
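
Note for reviewers: the kernel-configuration sources deleted above all share one dispatch pattern. `configure()` selects a per-data-type member-function pointer for the detected GPU target and forwards the GEMM shape to it, and each per-target heuristic then returns LHS/RHS block-size info. The standalone C++ sketch below illustrates only that pattern; the names (`ExampleConfigSelector`, `SimpleConfig`, the reduced `Target`/`Type` enums) and the toy heuristics are illustrative placeholders, not the library's actual API.

// Minimal sketch of per-target / per-data-type member-function dispatch,
// assuming simplified stand-in types rather than the real GEMM info structs.
#include <cstdio>
#include <stdexcept>

enum class Target { G76, Other };
enum class Type { F32, F16 };

// Stand-in for the pair of LHS/RHS reshape parameters returned by the real code.
struct SimpleConfig
{
    unsigned int m0; // rows processed per work-item
    unsigned int n0; // columns processed per work-item
};

class ExampleConfigSelector
{
public:
    explicit ExampleConfigSelector(Target target) : _target(target) {}

    // Mirrors configure(): pick the member function for (target, type), then call it.
    SimpleConfig configure(unsigned int m, unsigned int n, Type type) const
    {
        using Fn = SimpleConfig (ExampleConfigSelector::*)(unsigned int, unsigned int) const;
        Fn fn = nullptr;
        switch(_target)
        {
            case Target::G76:
                fn = (type == Type::F32) ? &ExampleConfigSelector::configure_G76_f32 : &ExampleConfigSelector::configure_G76_f16;
                break;
            default:
                fn = (type == Type::F32) ? &ExampleConfigSelector::configure_G7x_f32 : &ExampleConfigSelector::configure_G7x_f16;
                break;
        }
        if(fn == nullptr)
        {
            throw std::runtime_error("Data type not supported");
        }
        return (this->*fn)(m, n);
    }

private:
    // Toy heuristics: vector-by-matrix cases (m == 1) get narrow blocks, larger
    // workloads get square-ish blocks. The real heuristics branch on m, n, k, b.
    SimpleConfig configure_G76_f32(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }
    SimpleConfig configure_G76_f16(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{5, 8}; }
    SimpleConfig configure_G7x_f32(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }
    SimpleConfig configure_G7x_f16(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }

    Target _target;
};

int main()
{
    ExampleConfigSelector selector(Target::G76);
    const SimpleConfig cfg = selector.configure(64, 128, Type::F32);
    std::printf("m0=%u n0=%u\n", cfg.m0, cfg.n0);
    return 0;
}

The member-function-pointer table keeps every per-target heuristic behind the same `configure()` entry point, which is why the factory in ClGemmReshapedOnlyRhsKernelConfig.h only has to choose between the Bifrost and Valhall classes.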