Diffstat (limited to 'src/cpu')
-rw-r--r--  src/cpu/CpuContext.cpp | 28
-rw-r--r--  src/cpu/CpuContext.h | 12
-rw-r--r--  src/cpu/CpuQueue.cpp | 3
-rw-r--r--  src/cpu/CpuQueue.h | 4
-rw-r--r--  src/cpu/CpuTensor.cpp | 5
-rw-r--r--  src/cpu/CpuTensor.h | 8
-rw-r--r--  src/cpu/CpuTypes.h | 2
-rw-r--r--  src/cpu/ICpuKernel.h | 13
-rw-r--r--  src/cpu/kernels/CpuActivationKernel.cpp | 246
-rw-r--r--  src/cpu/kernels/CpuActivationKernel.h | 10
-rw-r--r--  src/cpu/kernels/CpuAddKernel.cpp | 233
-rw-r--r--  src/cpu/kernels/CpuAddKernel.h | 12
-rw-r--r--  src/cpu/kernels/CpuAddMulAddKernel.cpp | 99
-rw-r--r--  src/cpu/kernels/CpuAddMulAddKernel.h | 40
-rw-r--r--  src/cpu/kernels/CpuCastKernel.cpp | 1346
-rw-r--r--  src/cpu/kernels/CpuCastKernel.h | 7
-rw-r--r--  src/cpu/kernels/CpuCol2ImKernel.cpp | 27
-rw-r--r--  src/cpu/kernels/CpuCol2ImKernel.h | 3
-rw-r--r--  src/cpu/kernels/CpuConcatenateBatchKernel.cpp | 126
-rw-r--r--  src/cpu/kernels/CpuConcatenateBatchKernel.h | 6
-rw-r--r--  src/cpu/kernels/CpuConcatenateDepthKernel.cpp | 126
-rw-r--r--  src/cpu/kernels/CpuConcatenateDepthKernel.h | 6
-rw-r--r--  src/cpu/kernels/CpuConcatenateHeightKernel.cpp | 120
-rw-r--r--  src/cpu/kernels/CpuConcatenateHeightKernel.h | 4
-rw-r--r--  src/cpu/kernels/CpuConcatenateWidthKernel.cpp | 118
-rw-r--r--  src/cpu/kernels/CpuConcatenateWidthKernel.h | 4
-rw-r--r--  src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp | 32
-rw-r--r--  src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h | 20
-rw-r--r--  src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp | 55
-rw-r--r--  src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuCopyKernel.cpp | 61
-rw-r--r--  src/cpu/kernels/CpuCopyKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 130
-rw-r--r--  src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 24
-rw-r--r--  src/cpu/kernels/CpuDequantizeKernel.cpp | 234
-rw-r--r--  src/cpu/kernels/CpuDequantizeKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuDirectConv2dKernel.cpp | 62
-rw-r--r--  src/cpu/kernels/CpuDirectConv2dKernel.h | 14
-rw-r--r--  src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp | 388
-rw-r--r--  src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 34
-rw-r--r--  src/cpu/kernels/CpuDirectConv3dKernel.cpp | 87
-rw-r--r--  src/cpu/kernels/CpuDirectConv3dKernel.h | 20
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.cpp | 471
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.h | 13
-rw-r--r--  src/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 90
-rw-r--r--  src/cpu/kernels/CpuElementwiseUnaryKernel.h | 11
-rw-r--r--  src/cpu/kernels/CpuFillKernel.cpp | 22
-rw-r--r--  src/cpu/kernels/CpuFillKernel.h | 3
-rw-r--r--  src/cpu/kernels/CpuFloorKernel.cpp | 39
-rw-r--r--  src/cpu/kernels/CpuFloorKernel.h | 4
-rw-r--r--  src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp | 56
-rw-r--r--  src/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp | 1280
-rw-r--r--  src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 4
-rw-r--r--  src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp | 316
-rw-r--r--  src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 29
-rw-r--r--  src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp | 416
-rw-r--r--  src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h | 23
-rw-r--r--  src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp | 748
-rw-r--r--  src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h | 28
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp | 247
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h | 19
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 153
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h | 29
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 183
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 32
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 181
-rw-r--r--  src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h | 32
-rw-r--r--  src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp | 39
-rw-r--r--  src/cpu/kernels/CpuGemmMatrixAdditionKernel.h | 6
-rw-r--r--  src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp | 85
-rw-r--r--  src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h | 23
-rw-r--r--  src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp | 43
-rw-r--r--  src/cpu/kernels/CpuGemmTranspose1xWKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuIm2ColKernel.cpp | 288
-rw-r--r--  src/cpu/kernels/CpuIm2ColKernel.h | 35
-rw-r--r--  src/cpu/kernels/CpuKernelSelectionTypes.h | 31
-rw-r--r--  src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp | 75
-rw-r--r--  src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h | 13
-rw-r--r--  src/cpu/kernels/CpuMulKernel.cpp | 1772
-rw-r--r--  src/cpu/kernels/CpuMulKernel.h | 40
-rw-r--r--  src/cpu/kernels/CpuPermuteKernel.cpp | 155
-rw-r--r--  src/cpu/kernels/CpuPermuteKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuPool2dKernel.cpp | 329
-rw-r--r--  src/cpu/kernels/CpuPool2dKernel.h | 20
-rw-r--r--  src/cpu/kernels/CpuPool3dKernel.cpp | 75
-rw-r--r--  src/cpu/kernels/CpuPool3dKernel.h | 10
-rw-r--r--  src/cpu/kernels/CpuQuantizeKernel.cpp | 167
-rw-r--r--  src/cpu/kernels/CpuQuantizeKernel.h | 8
-rw-r--r--  src/cpu/kernels/CpuReshapeKernel.cpp | 65
-rw-r--r--  src/cpu/kernels/CpuReshapeKernel.h | 7
-rw-r--r--  src/cpu/kernels/CpuScaleKernel.cpp | 607
-rw-r--r--  src/cpu/kernels/CpuScaleKernel.h | 69
-rw-r--r--  src/cpu/kernels/CpuSoftmaxKernel.cpp | 193
-rw-r--r--  src/cpu/kernels/CpuSoftmaxKernel.h | 23
-rw-r--r--  src/cpu/kernels/CpuSubKernel.cpp | 109
-rw-r--r--  src/cpu/kernels/CpuSubKernel.h | 10
-rw-r--r--  src/cpu/kernels/CpuTransposeKernel.cpp | 772
-rw-r--r--  src/cpu/kernels/CpuTransposeKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuWeightsReshapeKernel.cpp | 86
-rw-r--r--  src/cpu/kernels/CpuWeightsReshapeKernel.h | 2
-rw-r--r--  src/cpu/kernels/CpuWinogradConv2dKernel.cpp | 59
-rw-r--r--  src/cpu/kernels/CpuWinogradConv2dKernel.h | 11
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp16.cpp | 4
-rw-r--r--  src/cpu/kernels/activation/generic/neon/fp32.cpp | 2
-rw-r--r--  src/cpu/kernels/activation/generic/neon/impl.h | 274
-rw-r--r--  src/cpu/kernels/activation/generic/neon/lut.cpp | 20
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8.cpp | 344
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp | 300
-rw-r--r--  src/cpu/kernels/activation/generic/neon/qsymm16.cpp | 163
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp16.cpp | 148
-rw-r--r--  src/cpu/kernels/activation/generic/sve/fp32.cpp | 149
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/lut.cpp | 20
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8.cpp | 264
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp | 306
-rw-r--r--  src/cpu/kernels/activation/generic/sve2/qsymm16.cpp | 121
-rw-r--r--  src/cpu/kernels/add/generic/neon/fp16.cpp | 5
-rw-r--r--  src/cpu/kernels/add/generic/neon/fp32.cpp | 5
-rw-r--r--  src/cpu/kernels/add/generic/neon/impl.cpp | 711
-rw-r--r--  src/cpu/kernels/add/generic/neon/impl.h | 145
-rw-r--r--  src/cpu/kernels/add/generic/neon/integer.cpp | 11
-rw-r--r--  src/cpu/kernels/add/generic/neon/qasymm8.cpp | 6
-rw-r--r--  src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp | 6
-rw-r--r--  src/cpu/kernels/add/generic/neon/qsymm16.cpp | 162
-rw-r--r--  src/cpu/kernels/add/generic/sve/fp16.cpp | 5
-rw-r--r--  src/cpu/kernels/add/generic/sve/fp32.cpp | 6
-rw-r--r--  src/cpu/kernels/add/generic/sve/impl.cpp | 106
-rw-r--r--  src/cpu/kernels/add/generic/sve/impl.h | 3
-rw-r--r--  src/cpu/kernels/add/generic/sve/integer.cpp | 12
-rw-r--r--  src/cpu/kernels/add/generic/sve2/qasymm8.cpp | 237
-rw-r--r--  src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp | 193
-rw-r--r--  src/cpu/kernels/add/generic/sve2/qsymm16.cpp | 119
-rw-r--r--  src/cpu/kernels/add/list.h | 7
-rw-r--r--  src/cpu/kernels/addmuladd/generic/neon/fp16.cpp | 106
-rw-r--r--  src/cpu/kernels/addmuladd/generic/neon/fp32.cpp | 104
-rw-r--r--  src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp | 137
-rw-r--r--  src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp | 137
-rw-r--r--  src/cpu/kernels/addmuladd/list.h | 5
-rw-r--r--  src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h | 12
-rw-r--r--  src/cpu/kernels/assembly/arm_gemm.hpp | 91
-rw-r--r--  src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp | 38
-rw-r--r--  src/cpu/kernels/assembly/gemm_common.hpp | 74
-rw-r--r--  src/cpu/kernels/assembly/ndrange.hpp | 19
-rw-r--r--  src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp | 6
-rw-r--r--  src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp | 6
-rw-r--r--  src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp | 85
-rw-r--r--  src/cpu/kernels/boundingboxtransform/generic/neon/impl.h | 85
-rw-r--r--  src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp | 6
-rw-r--r--  src/cpu/kernels/boundingboxtransform/list.h | 5
-rw-r--r--  src/cpu/kernels/cast/generic/neon/fp16.cpp | 393
-rw-r--r--  src/cpu/kernels/cast/list.h | 7
-rw-r--r--  src/cpu/kernels/conv3d/neon/list.h | 165
-rw-r--r--  src/cpu/kernels/conv3d/neon/quantized.h | 257
-rw-r--r--  src/cpu/kernels/crop/generic/neon/crop_helper.h | 4
-rw-r--r--  src/cpu/kernels/crop/generic/neon/fp16.cpp | 17
-rw-r--r--  src/cpu/kernels/crop/generic/neon/fp32.cpp | 17
-rw-r--r--  src/cpu/kernels/crop/generic/neon/impl.h | 49
-rw-r--r--  src/cpu/kernels/crop/generic/neon/integer.cpp | 92
-rw-r--r--  src/cpu/kernels/crop/list.h | 6
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp | 11
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp | 11
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp | 592
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h | 279
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp | 20
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp | 20
-rw-r--r--  src/cpu/kernels/depthwiseconv2d/list.h | 6
-rw-r--r--  src/cpu/kernels/directconv2d/list.h | 5
-rw-r--r--  src/cpu/kernels/directconv2d/nchw/all.cpp | 142
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp | 5
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp | 252
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/impl.h | 4
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp | 73
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp | 73
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/impl.h | 852
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp | 173
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp | 76
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp | 74
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp | 73
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp | 71
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp | 250
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/impl.h | 16
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp | 171
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/impl.h | 379
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp | 76
-rw-r--r--  src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp | 74
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/impl.h | 244
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp | 21
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp | 4
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp | 4
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp | 44
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp | 6
-rw-r--r--  src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp | 20
-rw-r--r--  src/cpu/kernels/floor/list.h | 3
-rw-r--r--  src/cpu/kernels/floor/neon/fp16.cpp | 4
-rw-r--r--  src/cpu/kernels/floor/neon/fp32.cpp | 4
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp | 16
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp | 16
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/generic/impl.h | 120
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/list.h | 15
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp | 147
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp | 16
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp | 16
-rw-r--r--  src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h | 143
-rw-r--r--  src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp | 44
-rw-r--r--  src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp | 49
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp | 472
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp | 15
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp | 898
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h | 7
-rw-r--r--  src/cpu/kernels/gemm_matrix_mul/list.h | 5
-rw-r--r--  src/cpu/kernels/genproposals/generic/neon/fp16.cpp | 7
-rw-r--r--  src/cpu/kernels/genproposals/generic/neon/fp32.cpp | 7
-rw-r--r--  src/cpu/kernels/genproposals/generic/neon/impl.cpp | 43
-rw-r--r--  src/cpu/kernels/genproposals/generic/neon/impl.h | 41
-rw-r--r--  src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp | 7
-rw-r--r--  src/cpu/kernels/instancenorm/generic/neon/fp16.cpp | 201
-rw-r--r--  src/cpu/kernels/instancenorm/generic/neon/fp32.cpp | 8
-rw-r--r--  src/cpu/kernels/instancenorm/generic/neon/impl.cpp | 173
-rw-r--r--  src/cpu/kernels/instancenorm/generic/neon/impl.h | 6
-rw-r--r--  src/cpu/kernels/instancenorm/list.h | 5
-rw-r--r--  src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 165
-rw-r--r--  src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h | 23
-rw-r--r--  src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp | 99
-rw-r--r--  src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h | 13
-rw-r--r--  src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp | 6
-rw-r--r--  src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp | 10
-rw-r--r--  src/cpu/kernels/l2normlayer/generic/neon/impl.h | 96
-rw-r--r--  src/cpu/kernels/l2normlayer/list.h | 5
-rw-r--r--  src/cpu/kernels/lut/generic/neon/u8.cpp | 736
-rw-r--r--  src/cpu/kernels/lut/generic/sve2/u8.cpp | 10
-rw-r--r--  src/cpu/kernels/lut/list.h | 10
-rw-r--r--  src/cpu/kernels/maxunpool/generic/neon/impl.h | 17
-rw-r--r--  src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp | 102
-rw-r--r--  src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp | 97
-rw-r--r--  src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp | 124
-rw-r--r--  src/cpu/kernels/pool2d/neon/fp16.cpp | 442
-rw-r--r--  src/cpu/kernels/pool2d/neon/fp32.cpp | 562
-rw-r--r--  src/cpu/kernels/pool2d/neon/list.h | 38
-rw-r--r--  src/cpu/kernels/pool2d/neon/nchw/all.cpp | 1097
-rw-r--r--  src/cpu/kernels/pool2d/neon/qasymm8.cpp | 12
-rw-r--r--  src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp | 12
-rw-r--r--  src/cpu/kernels/pool2d/neon/quantized.h | 961
-rw-r--r--  src/cpu/kernels/pool3d/neon/impl.h | 417
-rw-r--r--  src/cpu/kernels/pool3d/neon/quantized.h | 403
-rw-r--r--  src/cpu/kernels/range/generic/neon/fp16.cpp | 4
-rw-r--r--  src/cpu/kernels/range/generic/neon/fp32.cpp | 4
-rw-r--r--  src/cpu/kernels/range/generic/neon/impl.h | 48
-rw-r--r--  src/cpu/kernels/range/list.h | 3
-rw-r--r--  src/cpu/kernels/roialign/generic/neon/fp16.cpp | 7
-rw-r--r--  src/cpu/kernels/roialign/generic/neon/fp32.cpp | 7
-rw-r--r--  src/cpu/kernels/roialign/generic/neon/impl.h | 192
-rw-r--r--  src/cpu/kernels/roialign/generic/neon/qasymm8.cpp | 7
-rw-r--r--  src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp | 7
-rw-r--r--  src/cpu/kernels/roialign/list.h | 6
-rw-r--r--  src/cpu/kernels/scale/neon/fp16.cpp | 189
-rw-r--r--  src/cpu/kernels/scale/neon/integer.cpp | 416
-rw-r--r--  src/cpu/kernels/scale/neon/list.h | 163
-rw-r--r--  src/cpu/kernels/scale/neon/qasymm8.cpp | 217
-rw-r--r--  src/cpu/kernels/scale/neon/qasymm8_signed.cpp | 206
-rw-r--r--  src/cpu/kernels/scale/sve/fp16.cpp | 75
-rw-r--r--  src/cpu/kernels/scale/sve/fp32.cpp | 76
-rw-r--r--  src/cpu/kernels/scale/sve/integer.cpp | 145
-rw-r--r--  src/cpu/kernels/scale/sve/list.h | 8
-rw-r--r--  src/cpu/kernels/scale/sve/qasymm8.cpp | 74
-rw-r--r--  src/cpu/kernels/scale/sve/qasymm8_signed.cpp | 74
-rw-r--r--  src/cpu/kernels/select/generic/neon/fp16.cpp | 12
-rw-r--r--  src/cpu/kernels/select/generic/neon/fp32.cpp | 10
-rw-r--r--  src/cpu/kernels/select/generic/neon/impl.h | 111
-rw-r--r--  src/cpu/kernels/select/generic/neon/integer.cpp | 40
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/fp16.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/fp32.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/impl.cpp | 281
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/impl.h | 248
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/qasymm8.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/fp16.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/fp32.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/impl.cpp | 211
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/impl.h | 9
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/qasymm8.cpp | 3
-rw-r--r--  src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp | 3
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/impl.cpp | 289
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/impl.h | 9
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp | 12
-rw-r--r--  src/cpu/kernels/softmax/list.h | 9
-rw-r--r--  src/cpu/kernels/sub/neon/list.h | 119
-rw-r--r--  src/cpu/kernels/sub/neon/qasymm8.cpp | 9
-rw-r--r--  src/cpu/kernels/sub/neon/qasymm8_signed.cpp | 9
-rw-r--r--  src/cpu/kernels/sub/neon/qsymm16.cpp | 166
-rw-r--r--  src/cpu/operators/CpuActivation.cpp | 14
-rw-r--r--  src/cpu/operators/CpuActivation.h | 1
-rw-r--r--  src/cpu/operators/CpuAdd.cpp | 17
-rw-r--r--  src/cpu/operators/CpuAdd.h | 13
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.cpp | 85
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.h | 26
-rw-r--r--  src/cpu/operators/CpuCast.cpp | 3
-rw-r--r--  src/cpu/operators/CpuConcatenate.cpp | 34
-rw-r--r--  src/cpu/operators/CpuConcatenate.h | 4
-rw-r--r--  src/cpu/operators/CpuConv2d.cpp | 140
-rw-r--r--  src/cpu/operators/CpuConv2d.h | 40
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.cpp | 13
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.h | 8
-rw-r--r--  src/cpu/operators/CpuCopy.cpp | 3
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.cpp | 157
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.h | 86
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 29
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h | 17
-rw-r--r--  src/cpu/operators/CpuDequantize.cpp | 1
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.cpp | 50
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.h | 24
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.cpp | 27
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.h | 16
-rw-r--r--  src/cpu/operators/CpuElementwise.cpp | 18
-rw-r--r--  src/cpu/operators/CpuElementwise.h | 5
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.cpp | 5
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.h | 3
-rw-r--r--  src/cpu/operators/CpuFill.cpp | 3
-rw-r--r--  src/cpu/operators/CpuFill.h | 1
-rw-r--r--  src/cpu/operators/CpuFlatten.cpp | 6
-rw-r--r--  src/cpu/operators/CpuFloor.cpp | 3
-rw-r--r--  src/cpu/operators/CpuFullyConnected.cpp | 225
-rw-r--r--  src/cpu/operators/CpuFullyConnected.h | 52
-rw-r--r--  src/cpu/operators/CpuGemm.cpp | 198
-rw-r--r--  src/cpu/operators/CpuGemm.h | 66
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp | 378
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.h | 78
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.cpp | 85
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.h | 17
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 365
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 17
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.cpp | 52
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.h | 6
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp | 113
-rw-r--r--  src/cpu/operators/CpuMatMul.h | 34
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.cpp | 13
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.h | 8
-rw-r--r--  src/cpu/operators/CpuMul.cpp | 27
-rw-r--r--  src/cpu/operators/CpuMul.h | 25
-rw-r--r--  src/cpu/operators/CpuPermute.cpp | 5
-rw-r--r--  src/cpu/operators/CpuPool2d.cpp | 35
-rw-r--r--  src/cpu/operators/CpuPool2d.h | 11
-rw-r--r--  src/cpu/operators/CpuPool3d.cpp | 6
-rw-r--r--  src/cpu/operators/CpuPool3d.h | 3
-rw-r--r--  src/cpu/operators/CpuQuantize.cpp | 1
-rw-r--r--  src/cpu/operators/CpuReshape.cpp | 7
-rw-r--r--  src/cpu/operators/CpuReshape.h | 5
-rw-r--r--  src/cpu/operators/CpuScale.cpp | 128
-rw-r--r--  src/cpu/operators/CpuScale.h | 9
-rw-r--r--  src/cpu/operators/CpuSoftmax.cpp | 101
-rw-r--r--  src/cpu/operators/CpuSoftmax.h | 6
-rw-r--r--  src/cpu/operators/CpuSub.cpp | 17
-rw-r--r--  src/cpu/operators/CpuSub.h | 13
-rw-r--r--  src/cpu/operators/CpuTranspose.cpp | 5
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.cpp | 263
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.h | 62
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 393
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 55
-rw-r--r--  src/cpu/utils/CpuAuxTensorHandler.h | 26
364 files changed, 21336 insertions, 17300 deletions
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp
index 7c14891ef8..b745af8229 100644
--- a/src/cpu/CpuContext.cpp
+++ b/src/cpu/CpuContext.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/CpuContext.h"
#include "arm_compute/core/CPP/CPPTypes.h"
+
#include "src/cpu/CpuQueue.h"
#include "src/cpu/CpuTensor.h"
@@ -32,7 +33,7 @@
#include <malloc.h>
#if defined(_WIN64)
-#define posix_memalign _aligned_realloc
+#define posix_memalign _aligned_realloc
#define posix_memalign_free _aligned_free
#endif // defined(_WIN64)
#endif // !defined(__APPLE__) && !defined(__OpenBSD__)
@@ -66,7 +67,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment)
size_t real_size = (rem) ? (size + alignment - rem) : size;
ptr = memalign(alignment, real_size);
#else /* defined(BARE_METAL) */
- if(posix_memalign(&ptr, alignment, size) != 0)
+ if (posix_memalign(&ptr, alignment, size) != 0)
{
// posix_memalign returns non-zero on failures, the return values will be
// - EINVAL: wrong alignment
@@ -81,17 +82,13 @@ void default_aligned_free(void *user_data, void *ptr)
ARM_COMPUTE_UNUSED(user_data);
free(ptr);
}
-static AclAllocator default_allocator = { &default_allocate,
- &default_free,
- &default_aligned_allocate,
- &default_aligned_free,
- nullptr
- };
+static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate,
+ &default_aligned_free, nullptr};
AllocatorWrapper populate_allocator(AclAllocator *external_allocator)
{
bool is_valid = (external_allocator != nullptr);
- if(is_valid)
+ if (is_valid)
{
is_valid = is_valid && (external_allocator->alloc != nullptr);
is_valid = is_valid && (external_allocator->free != nullptr);
@@ -123,14 +120,13 @@ cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_c
return isa_caps;
}
-CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps,
- int32_t max_threads)
+CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads)
{
CpuCapabilities caps;
// Populate capabilities with system information
caps.cpu_info = cpuinfo::CpuInfo::build();
- if(external_caps != AclCpuCapabilitiesAuto)
+ if (external_caps != AclCpuCapabilitiesAuto)
{
cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps);
auto cpus = caps.cpu_info.cpus();
@@ -151,11 +147,9 @@ CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps,
} // namespace
CpuContext::CpuContext(const AclContextOptions *options)
- : IContext(Target::Cpu),
- _allocator(default_allocator),
- _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
+ : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
{
- if(options != nullptr)
+ if (options != nullptr)
{
_allocator = populate_allocator(options->allocator);
_caps = populate_capabilities(options->capabilities, options->max_compute_units);
@@ -175,7 +169,7 @@ AllocatorWrapper &CpuContext::allocator()
ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
{
CpuTensor *tensor = new CpuTensor(this, desc);
- if(tensor != nullptr && allocate)
+ if (tensor != nullptr && allocate)
{
tensor->allocate();
}
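
Editor's note on the hunks above: `default_aligned_allocate` rounds the requested size up to a multiple of the alignment for the bare-metal `memalign` path, and otherwise relies on `posix_memalign` reporting failure through its return value. A minimal, self-contained sketch of the hosted-platform pattern follows; the function names are illustrative stand-ins, not the ACL symbols.

    #include <stdlib.h>
    #include <stddef.h>

    // Sketch only: posix_memalign reports failure via its return value
    // (EINVAL for a bad alignment, ENOMEM when out of memory), not errno.
    // The alignment must be a power of two and a multiple of sizeof(void *).
    void *aligned_allocate_sketch(size_t size, size_t alignment)
    {
        void *ptr = nullptr;
        if (posix_memalign(&ptr, alignment, size) != 0)
        {
            return nullptr;
        }
        return ptr;
    }

    void aligned_free_sketch(void *ptr)
    {
        free(ptr); // memory from posix_memalign is released with plain free()
    }
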
diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h
index da241ed097..0c8ae49f49 100644
--- a/src/cpu/CpuContext.h
+++ b/src/cpu/CpuContext.h
@@ -25,8 +25,8 @@
#define SRC_CPU_CPUCONTEXT_H
#include "src/common/AllocatorWrapper.h"
-#include "src/common/IContext.h"
#include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/IContext.h"
namespace arm_compute
{
@@ -36,7 +36,7 @@ namespace cpu
struct CpuCapabilities
{
cpuinfo::CpuInfo cpu_info{};
- int32_t max_threads{ -1 };
+ int32_t max_threads{-1};
};
/** CPU context implementation class */
@@ -60,9 +60,9 @@ public:
AllocatorWrapper &allocator();
// Inherrited methods overridden
- ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
- IQueue *create_queue(const AclQueueOptions *options) override;
- std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+ ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+ IQueue *create_queue(const AclQueueOptions *options) override;
+ std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
const AclTensorDescriptor &dst,
const AclActivationDescriptor &act,
bool is_validate) override;
@@ -74,4 +74,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CPU_CPUCONTEXT_H */ \ No newline at end of file
+#endif /* SRC_CPU_CPUCONTEXT_H */
diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp
index 0f0097b3f4..be781d6794 100644
--- a/src/cpu/CpuQueue.cpp
+++ b/src/cpu/CpuQueue.cpp
@@ -29,8 +29,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options)
- : IQueue(ctx)
+CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx)
{
ARM_COMPUTE_UNUSED(options);
}
diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h
index 871a36c85b..b6a2be0e23 100644
--- a/src/cpu/CpuQueue.h
+++ b/src/cpu/CpuQueue.h
@@ -24,10 +24,10 @@
#ifndef SRC_CPU_CPUQUEUE_H
#define SRC_CPU_CPUQUEUE_H
-#include "src/common/IQueue.h"
-
#include "arm_compute/runtime/IScheduler.h"
+#include "src/common/IQueue.h"
+
namespace arm_compute
{
namespace cpu
diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp
index 6dd6d9c31b..59082b5350 100644
--- a/src/cpu/CpuTensor.cpp
+++ b/src/cpu/CpuTensor.cpp
@@ -29,8 +29,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc)
- : ITensorV2(ctx), _legacy_tensor()
+CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
{
ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu));
_legacy_tensor = std::make_unique<Tensor>();
@@ -41,7 +40,7 @@ void *CpuTensor::map()
{
ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
- if(_legacy_tensor == nullptr)
+ if (_legacy_tensor == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!");
return nullptr;
diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h
index b078774c99..89931e1f94 100644
--- a/src/cpu/CpuTensor.h
+++ b/src/cpu/CpuTensor.h
@@ -24,10 +24,10 @@
#ifndef SRC_CPU_CPUTENSOR_H
#define SRC_CPU_CPUTENSOR_H
-#include "src/common/ITensorV2.h"
-
#include "arm_compute/runtime/Tensor.h"
+#include "src/common/ITensorV2.h"
+
namespace arm_compute
{
namespace cpu
@@ -52,7 +52,7 @@ public:
void *map() override;
StatusCode unmap() override;
arm_compute::ITensor *tensor() const override;
- StatusCode import(void *handle, ImportMemoryType type) override;
+ StatusCode import(void *handle, ImportMemoryType type) override;
private:
std::unique_ptr<Tensor> _legacy_tensor;
@@ -60,4 +60,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CPU_CPUTENSOR_H */ \ No newline at end of file
+#endif /* SRC_CPU_CPUTENSOR_H */
diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h
index 0f7b9b6552..8726bc470a 100644
--- a/src/cpu/CpuTypes.h
+++ b/src/cpu/CpuTypes.h
@@ -31,6 +31,6 @@ namespace arm_compute
typedef __fp16 float16_t;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef float float32_t;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPUTYPES */
diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h
index 8f4106240d..bcd0cb2c70 100644
--- a/src/cpu/ICpuKernel.h
+++ b/src/cpu/ICpuKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_ICPUKERNEL_H
#include "arm_compute/core/CPP/ICPPKernel.h"
+
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
namespace arm_compute
@@ -34,7 +35,7 @@ namespace cpu
enum class KernelSelectionType
{
Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */
- Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
+ Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
};
template <class Derived>
@@ -50,13 +51,15 @@ public:
*/
template <typename SelectorType>
- static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported)
+ static const auto *get_implementation(const SelectorType &selector,
+ KernelSelectionType selection_type = KernelSelectionType::Supported)
{
- using kernel_type = typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
+ using kernel_type =
+ typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
- for(const auto &uk : Derived::get_available_kernels())
+ for (const auto &uk : Derived::get_available_kernels())
{
- if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
+ if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
{
return &uk;
}
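
Editor's note on `get_implementation` above: it walks a static table of micro-kernels and returns the first entry whose predicate accepts the selector; with `KernelSelectionType::Supported` it additionally skips entries whose function pointer is null because that variant was compiled out. A reduced sketch of the same lookup, using hypothetical types in place of the ACL ones:

    #include <string>
    #include <vector>

    enum class SelectionTypeSketch
    {
        Preferred, // best match for the ISA, even if not built into this binary
        Supported  // best match that also has an implementation compiled in
    };

    struct SelectorSketch
    {
        bool is_fp16; // stand-in for the data-type / ISA fields
    };

    struct MicroKernelSketch
    {
        std::string name;
        bool (*is_selected)(const SelectorSketch &);
        void (*ukernel)(); // nullptr when the variant was compiled out
    };

    // First matching entry wins, so table order encodes priority.
    inline const MicroKernelSketch *pick_kernel(const std::vector<MicroKernelSketch> &table,
                                                const SelectorSketch                 &selector,
                                                SelectionTypeSketch type = SelectionTypeSketch::Supported)
    {
        for (const auto &uk : table)
        {
            if (uk.is_selected(selector) && (type == SelectionTypeSketch::Preferred || uk.ukernel != nullptr))
            {
                return &uk;
            }
        }
        return nullptr;
    }
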
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index f4bd4e6cad..50bf672d3c 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -26,11 +26,11 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/activation/list.h"
#include <array>
@@ -43,126 +43,126 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels =
-{
+static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
#ifdef ARM_COMPUTE_ENABLE_SVE
- {
- "sve2_q8_activation_lut",
- [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)
- },
+ {"sve2_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+ data.cpumodel == CPUModel::A510 && data.isa.sve2;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)},
#endif // ARM_COMPUTE_ENABLE_SVE
#ifdef __aarch64__
- {
- // Neon LUT implementantion takes precedence
- "neon_q8_activation_lut",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)
- },
+ {// Neon LUT implementantion takes precedence
+ "neon_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
#endif // __aarch64__
- {
- "sve2_qu8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
- },
- {
- "sve2_qs8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
- },
- {
- "sve2_qs16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
- },
- {
- "sve_fp16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
- },
- {
- "sve_fp32_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
- },
- {
- "neon_fp16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
- },
- {
- "neon_fp32_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
- },
- {
- "neon_qu8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
- },
- {
- "neon_qs8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
- },
- {
- "neon_qs16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
- },
+ {"sve2_qu8_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)},
+ {"sve2_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)},
+ {"sve2_qs16_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QSYMM16 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
+ {"sve_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)},
+ {"sve_fp32_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)},
+ {"neon_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)},
+ {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)},
+ {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)},
+ {"neon_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)},
+ {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)},
};
/* Supported activation in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations =
-{
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
- ActivationLayerInfo::ActivationFunction::GELU,
+static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU,
};
/* Supported activation in the 16-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations =
-{
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-};
+static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = {
+ ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH,
+ ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
- const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
+ const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
+ const DataType data_type = src->data_type();
+ const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+ const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
- "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized_asymmetric(data_type) &&
+ (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) ==
+ std::end(qasymm8_activations)),
+ "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
+ (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations),
+ f_act) == std::end(qsymm16_activations)),
"For QSYMM16 only tanh and logistic are supported");
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
// Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -176,7 +176,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
// Configure kernel window
Window win = calculate_max_window(*src, Steps());
- if(dst != nullptr)
+ if (dst != nullptr)
{
// dst auto inizialitation if not yet initialized
auto_init_if_empty(*dst, *src->clone());
@@ -185,14 +185,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
return std::make_pair(Status{}, win);
}
#ifdef __aarch64__
-void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type,
- const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out,
- ActivationLayerInfo::LookupTable256 &lut, float a, float b)
+void init_lut(ActivationLayerInfo::ActivationFunction act_func,
+ DataType data_type,
+ const UniformQuantizationInfo &qi_in,
+ const UniformQuantizationInfo &qi_out,
+ ActivationLayerInfo::LookupTable256 &lut,
+ float a,
+ float b)
{
- for(size_t i = 0; i < lut.size(); ++i)
+ for (size_t i = 0; i < lut.size(); ++i)
{
- float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
- switch(act_func)
+ float tmp_f =
+ (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
+ switch (act_func)
{
case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
@@ -246,7 +251,8 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_ty
tmp_f = 0;
break;
}
- lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
+ lut[i] =
+ (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
}
}
#endif // __aarch64__
@@ -258,8 +264,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
- const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
- if(dst != nullptr)
+ const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
+ if (dst != nullptr)
{
// dst auto inizialitation if not yet initialized
auto_init_if_empty(*dst, *src->clone());
@@ -271,11 +278,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
_name = std::string("CpuActivationKernel").append("/").append(uk->name);
#ifdef __aarch64__
- if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
+ if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
{
ActivationLayerInfo::LookupTable256 tmp_lut;
- init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(),
- tmp_lut, activation_info.a(), activation_info.b());
+ init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(),
+ (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut,
+ activation_info.a(), activation_info.b());
activation_info.setLookupTable256(tmp_lut);
}
#endif // __aarch64__
@@ -288,11 +296,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
ICPPKernel::configure(win);
}
-Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status
+CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
return Status{};
}
@@ -302,7 +312,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count
ARM_COMPUTE_UNUSED(thread_count);
ARM_COMPUTE_UNUSED(platform);
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
// Don't split the work load too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -314,7 +324,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count
void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
// Early exit on disabled activation
- if(!_act_info.enabled())
+ if (!_act_info.enabled())
{
return;
}
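
Editor's note on `init_lut` above: it fills a 256-entry table by dequantizing every possible 8-bit input, evaluating the activation in float, and re-quantizing with the destination quantization info, so the q8 LUT kernels reduce the per-element work to one table lookup. A compact sketch of that construction for a QASYMM8 RELU, with the quantize/dequantize arithmetic written out inline rather than using the ACL helpers:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    struct UniformQuantSketch
    {
        float   scale;
        int32_t offset;
    };

    // Build the table once at configure time: dequantize -> activation -> requantize.
    std::array<uint8_t, 256> make_relu_lut(const UniformQuantSketch &qi_in, const UniformQuantSketch &qi_out)
    {
        std::array<uint8_t, 256> lut{};
        for (std::size_t i = 0; i < lut.size(); ++i)
        {
            const float x = (static_cast<int32_t>(i) - qi_in.offset) * qi_in.scale; // dequantize
            const float y = std::max(x, 0.0f);                                      // RELU in float
            const int   q = static_cast<int>(std::lround(y / qi_out.scale)) + qi_out.offset;
            lut[i]        = static_cast<uint8_t>(std::min(std::max(q, 0), 255));    // requantize + clamp
        }
        return lut;
    }

    // At run time the activation is a single indexed load per element.
    void apply_lut(const std::array<uint8_t, 256> &lut, const uint8_t *src, uint8_t *dst, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            dst[i] = lut[src[i]];
        }
    }
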
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index 804407653f..4bad9fb3e8 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -38,7 +39,8 @@ namespace kernels
class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
{
private:
- using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+ using ActivationKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
public:
CpuActivationKernel() = default;
@@ -71,7 +73,7 @@ public:
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
@@ -94,8 +96,8 @@ public:
private:
ActivationLayerInfo _act_info{};
- ActivationKernelPtr _run_method{ nullptr };
- size_t _split_dimension{ Window::DimY };
+ ActivationKernelPtr _run_method{nullptr};
+ size_t _split_dimension{Window::DimY};
std::string _name{};
};
} // namespace kernels
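
Editor's note on the header above: the kernel stores the selected micro-kernel as a plain function pointer (`_run_method`, defaulted to `nullptr`) plus a name string; `configure()` fills them in once and `run_op()` only makes an indirect call. A stripped-down sketch of that configure/run split, with made-up types in place of `ITensor` and the selection table:

    #include <cassert>
    #include <string>

    struct TensorSketch; // opaque stand-in for ITensor

    class ActivationOpSketch
    {
        using KernelPtr = void (*)(const TensorSketch *, TensorSketch *);

        static void run_fp32(const TensorSketch *, TensorSketch *) { /* fp32 path */ }
        static void run_fp16(const TensorSketch *, TensorSketch *) { /* fp16 path */ }

        KernelPtr   _run_method{nullptr};
        std::string _name{};

    public:
        // Selection happens once; only a pointer and a readable name are stored.
        void configure(bool want_fp16)
        {
            _run_method = want_fp16 ? &run_fp16 : &run_fp32;
            _name       = want_fp16 ? "sketch_fp16_activation" : "sketch_fp32_activation";
        }

        // The hot path is a single indirect call through the stored pointer.
        void run(const TensorSketch *src, TensorSketch *dst) const
        {
            assert(_run_method != nullptr);
            _run_method(src, dst);
        }
    };
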
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 2983575cb6..a990aa4715 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -26,19 +26,21 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/add/list.h"
+
#include <array>
#if defined(ENABLE_FP32_KERNELS)
namespace
{
- static constexpr size_t default_mws_N1_fp32_neon = 24536;
- static constexpr size_t default_mws_V1_fp32_neon = 40510;
-}
+static constexpr size_t default_mws_N1_fp32_neon = 24536;
+static constexpr size_t default_mws_V1_fp32_neon = 40510;
+} // namespace
#endif /* ENABLE_FP32_KERNELS */
namespace arm_compute
@@ -49,152 +51,82 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuAddKernel::AddKernel> available_kernels =
-{
- {
- "neon_qu8_add_fixedpoint",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)
- },
- {
- "neon_qs8_add_fixedpoint",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)
- },
- {
- "sve2_qu8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8) && data.isa.sve2;
- },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
- },
- {
- "sve2_qs8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
- },
- {
- "sve2_qs16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QSYMM16) && data.isa.sve2;
- },
- REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
- },
- {
- "sve_fp32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32) && data.isa.sve;
- },
- REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
- },
- {
- "sve_fp16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16;
- },
- REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
- },
- {
- "sve_u8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::U8) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
- },
- {
- "sve_s16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S16) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
- },
- {
- "sve_s32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S32) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
- },
- {
- "neon_fp32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)
- },
- {
- "neon_fp16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)
- },
- {
- "neon_u8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)
- },
- {
- "neon_s16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)
- },
- {
- "neon_s32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)
- },
- {
- "neon_qu8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
- },
- {
- "neon_qs8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
- },
- {
- "neon_qs16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
- }
-};
-
-Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+static const std::vector<CpuAddKernel::AddKernel> available_kernels = {
+ {"neon_qu8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)},
+ {"neon_qs8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)},
+ {"sve2_qu8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)},
+ {"sve2_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)},
+ {"sve2_qs16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)},
+ {"sve_fp32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)},
+ {"sve_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)},
+ {"sve_u8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)},
+ {"sve_s16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)},
+ {"sve_s32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)},
+ {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)},
+ {"neon_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)},
+ {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)},
+ {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)},
+ {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)},
+ {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)},
+ {"neon_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)},
+ {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}};
+
+Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
- || (src1.data_type() != dst.data_type())),
- "Broadcasting across width is supported on configurations where all tensors have the same data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (src0.tensor_shape().x() != src1.tensor_shape().x()) &&
+ ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) ||
+ (src1.data_type() != dst.data_type())),
+ "Broadcasting across width is supported on configurations where all tensors have the same data type");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -202,8 +134,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
}
const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
- const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0.data_type(),
- CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -215,9 +147,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
- const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
- const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0->data_type(),
- CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -237,7 +169,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ICpuKernel::configure(win);
}
-Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -277,14 +210,14 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &add_fp32_neon)
+ if (this->_run_method == &add_fp32_neon)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
@@ -294,7 +227,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -307,7 +240,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
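
The table rewrapped above pairs each micro-kernel with a name and a selector lambda over CpuAddKernelDataTypeISASelectorData; the specialised entries (fixed-point q8, SVE2, SVE) are listed ahead of the generic NEON fallbacks, which suggests a first-match lookup. A minimal sketch of such a lookup, using simplified stand-in types rather than the arm_compute classes:

// First-match lookup over a {name, selector, kernel} table, mirroring how the
// entries above are ordered from most specific to generic. Types are
// illustrative stand-ins, not the arm_compute definitions.
#include <functional>
#include <string>
#include <vector>

namespace sketch
{
struct SelectorData // stand-in for CpuAddKernelDataTypeISASelectorData
{
    int  dt;                 // data-type tag
    bool sve, sve2, fp16;    // ISA flags
    bool can_use_fixedpoint; // q8 fixed-point path available
};

using KernelFn = void (*)(); // stand-in for the real AddKernelPtr signature

struct KernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    KernelFn                                  ukernel;
};

// Walks the table in order and returns the first matching entry, or nullptr.
inline const KernelEntry *get_implementation(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.is_selected(data))
        {
            return &entry;
        }
    }
    return nullptr;
}
} // namespace sketch
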
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 9921feabe2..4adba8bb16 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuAddKernel : public ICpuKernel<CpuAddKernel>
{
private:
- using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ using AddKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
public:
struct AddKernel
@@ -74,10 +75,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Return minimum workload size of the relevant kernel
@@ -98,9 +100,9 @@ public:
private:
ConvertPolicy _policy{};
- AddKernelPtr _run_method{ nullptr };
+ AddKernelPtr _run_method{nullptr};
std::string _name{};
- size_t _split_dimension{ Window::DimY };
+ size_t _split_dimension{Window::DimY};
};
} // namespace kernels
} // namespace cpu
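
AddKernelPtr above is built with std::add_pointer over the full run signature; applied to a function type, the trait yields the corresponding raw function-pointer type that _run_method stores. A self-contained illustration with opaque stand-in types:

// std::add_pointer applied to a function type yields the corresponding raw
// function-pointer type; this is what AddKernelPtr names and what the
// _run_method member stores. Tensor/Window/ConvertPolicy are opaque stand-ins.
#include <type_traits>

struct Tensor;
struct Window;
enum class ConvertPolicy { WRAP, SATURATE };

using AddKernelPtr =
    std::add_pointer<void(const Tensor *, const Tensor *, Tensor *, const ConvertPolicy &, const Window &)>::type;

static_assert(
    std::is_same<AddKernelPtr,
                 void (*)(const Tensor *, const Tensor *, Tensor *, const ConvertPolicy &, const Window &)>::value,
    "add_pointer produces the equivalent raw function-pointer type");
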
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp
index b84bdd54e9..6a632e8702 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/addmuladd/list.h"
@@ -41,36 +41,28 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels =
-{
+static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = {
#ifdef __aarch64__
- {
- "neon_fp32_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)
- },
- {
- "neon_fp16_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)
- },
- {
- "neon_qasymm8_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)
- },
- {
- "neon_qasymm8_signed_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)
- }
+ {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)},
+ {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)},
+ {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)},
+ {"neon_qasymm8_signed_add_mul_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)}
#endif // __aarch64__
};
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
@@ -78,16 +70,16 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
using ActFunction = ActivationLayerInfo::ActivationFunction;
const ActFunction act_func = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
- "Only RELU Family activations, or no activation, is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU &&
+ act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
+ "Only RELU Family activations, or no activation, is supported");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- if(is_data_type_quantized(input1->data_type()))
+ if (is_data_type_quantized(input1->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32);
@@ -101,39 +93,47 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0],
+ "First dimensions of inputs and batchNorm coefs should match");
// Validate in case we have add layer's output (intermediate) initialized
- if(add_output != nullptr && add_output->total_size() > 0)
+ if (add_output != nullptr && add_output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output);
}
// Validate in case final output has been initialized
- if(final_output->total_size() > 0)
+ if (final_output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output);
}
- const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
}
} // namespace
-void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAddKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2);
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
- const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
@@ -146,7 +146,7 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo
set_shape_if_empty(*final_output, input1->tensor_shape());
set_data_type_if_unknown(*final_output, input1->data_type());
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
set_shape_if_empty(*add_output, input1->tensor_shape());
set_data_type_if_unknown(*add_output, input1->data_type());
@@ -158,14 +158,19 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAddKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
return Status{};
}
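
The validation above pins down the fused op's contract: equal-shaped inputs, 1-D bn_mul/bn_add whose length matches the inputs' first dimension, an optional intermediate add_output, and a RELU-family (or identity) activation on the final output. A scalar reference sketch of the computation this implies; the formula is an assumption drawn from the kernel name and the shape checks, not from code shown here:

// Scalar reference sketch of the fused computation, assuming
//   final_output = act((input1 + input2) * bn_mul + bn_add)
// with bn_mul / bn_add broadcast along the first dimension and the
// intermediate sum optionally written to add_output. Illustrative only.
#include <algorithm>
#include <cstddef>
#include <vector>

inline void add_mul_add_reference(const std::vector<float> &input1,
                                  const std::vector<float> &input2,
                                  const std::vector<float> &bn_mul, // length == size of first dimension
                                  const std::vector<float> &bn_add, // length == size of first dimension
                                  std::vector<float>       &add_output,
                                  std::vector<float>       &final_output,
                                  float                     act_lower, // e.g. 0.f for RELU-family clamps
                                  float                     act_upper)
{
    const std::size_t channels = bn_mul.size();
    for (std::size_t i = 0; i < input1.size(); ++i)
    {
        const std::size_t c   = i % channels;          // first dimension assumed fastest-varying
        const float       sum = input1[i] + input2[i]; // intermediate Add result
        add_output[i]         = sum;
        const float scaled    = sum * bn_mul[c] + bn_add[c];
        final_output[i]       = std::min(std::max(scaled, act_lower), act_upper); // bounded-RELU-style clamp
    }
}
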
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h
index 67ce6f029a..c5e31ec291 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.h
+++ b/src/cpu/kernels/CpuAddMulAddKernel.h
@@ -26,6 +26,7 @@
#define SRC_CPU_KERNELS_CPUADDMULADDKERNEL
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,8 +40,15 @@ namespace kernels
class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel>
{
private:
- using AddMulAddKernelPtr =
- std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, const ITensor *, ITensor *, ITensor *, ConvertPolicy, const ActivationLayerInfo &, const Window &)>::type;
+ using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ ConvertPolicy,
+ const ActivationLayerInfo &,
+ const Window &)>::type;
public:
struct AddMulAddKernel
@@ -57,23 +65,31 @@ public:
* Similar to @ref NEAddMulAdd::configure()
*
*/
- void configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuAddMulAddKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<AddMulAddKernel> &get_available_kernels();
@@ -81,7 +97,7 @@ public:
private:
ConvertPolicy _policy{};
ActivationLayerInfo _act_info{};
- AddMulAddKernelPtr _run_method{ nullptr };
+ AddMulAddKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
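
The private members rewrapped above (_policy, _act_info, _run_method, _name) carry the usual configure/run split of these kernels: configure() resolves a micro-kernel once and caches its pointer, run_op() only dispatches through it. A simplified stand-alone sketch of that pattern, again with stand-in types rather than the arm_compute classes:

// configure() caches a resolved function pointer; run_op() forwards to it.
#include <string>
#include <utility>

namespace sketch
{
struct Tensor;
struct Window;
enum class ConvertPolicy { WRAP, SATURATE };

using KernelFn = void (*)(const Tensor *, const Tensor *, Tensor *, ConvertPolicy, const Window &);

class FusedKernel
{
public:
    void configure(KernelFn resolved, ConvertPolicy policy, std::string name)
    {
        _run_method = resolved; // resolved once, at configure time
        _policy     = policy;
        _name       = std::move(name);
    }

    void run_op(const Tensor *in0, const Tensor *in1, Tensor *out, const Window &window) const
    {
        _run_method(in0, in1, out, _policy, window); // dispatch through the stored pointer
    }

private:
    ConvertPolicy _policy{};
    KernelFn      _run_method{nullptr};
    std::string   _name{};
};
} // namespace sketch
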
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
index 764a1ec71c..05c7742b03 100644
--- a/src/cpu/kernels/CpuCastKernel.cpp
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -28,16 +28,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
#include "src/cpu/kernels/cast/list.h"
+#include "support/SaturateCast.h"
namespace arm_compute
{
@@ -47,38 +47,30 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuCastKernel::CastKernel> available_kernels =
-{
- {
- "neon_qs8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
- },
- {
- "neon_qu8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
- },
- {
- "neon_u8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
- },
- {
- "neon_fp16_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
- },
- {
- "neon_fp32_to_fp16_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
- },
- {
- "neon_s32_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
- },
+static const std::vector<CpuCastKernel::CastKernel> available_kernels = {
+ {"neon_qs8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)},
+ {"neon_qu8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_u8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_fp16_cast",
+ [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)},
+ {"neon_fp32_to_fp16_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)},
+ {"neon_s32_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)},
};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
@@ -88,57 +80,67 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
#ifdef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::F32, DataType::S32, DataType::S64, DataType::U64);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::U32, DataType::S32, DataType::F32, DataType::S64);
#else // __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::U32, DataType::S32, DataType::F32);
#endif // __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
- && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 &&
+ dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
"Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
"Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 &&
+ (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::U8
- && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
"Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
"Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16
- && dst->data_type() != DataType::F32
- && dst->data_type() != DataType::U8
- && dst->data_type() != DataType::S64),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::S64),
"Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64");
#ifdef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32,
@@ -149,7 +151,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
#endif // __aarch64__
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
@@ -193,15 +195,8 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr)
template <>
inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr),
- vld1q_s32(src_ptr + 4),
- vld1q_s32(src_ptr + 8),
- vld1q_s32(src_ptr + 12)
- }
- };
+ const int32x4x4_t texels = {
+ {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}};
vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0])));
vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0])));
vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1])));
@@ -215,33 +210,14 @@ inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int6
template <>
inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
{
- const float64x2x4_t texels0 =
- {
- {
- vcvtq_f64_s64(vld1q_s64(src_ptr)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 4)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 6))
- }
- };
- const float64x2x4_t texels1 =
- {
- {
- vcvtq_f64_s64(vld1q_s64(src_ptr + 8)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 12)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 14))
- }
- };
- const float32x4x4_t texels =
- {
- {
- vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
- vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
- vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
- vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
- }
- };
+ const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}};
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
vst1q_f32(dst_ptr, texels.val[0]);
vst1q_f32(dst_ptr + 4, texels.val[1]);
vst1q_f32(dst_ptr + 8, texels.val[2]);
@@ -251,34 +227,15 @@ inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float
template <>
inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
{
- const float64x2x4_t texels0 =
- {
- {
- vcvtq_f64_u64(vld1q_u64(src_ptr)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 4)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 6))
- }
- };
- const float64x2x4_t texels1 =
- {
- {
- vcvtq_f64_u64(vld1q_u64(src_ptr + 8)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 12)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 14))
- }
- };
+ const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}};
- const float32x4x4_t texels =
- {
- {
- vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
- vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
- vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
- vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
- }
- };
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
vst1q_f32(dst_ptr, texels.val[0]);
vst1q_f32(dst_ptr + 4, texels.val[1]);
@@ -287,23 +244,26 @@ inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, floa
}
template <typename T1, typename T2>
-inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
+inline void
+convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
- }
- },
- src, dst);
+ const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
} // namespace
#endif // __aarch64__
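
convert64 above and the hand-written cases in run_op below all follow the same shape: a vectorised main loop that widens window_step_x elements per iteration with NEON intrinsics, followed by a scalar tail for the left-over elements. A minimal standalone example of that widen-then-convert pattern for s8 to f32 (AArch64 NEON, illustrative only):

// 16 int8 values are widened to int16, then to int32, then converted to
// float, with a scalar tail for the remainder, as in the loops below.
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

inline void s8_to_f32(const int8_t *src, float *dst, std::size_t count)
{
    std::size_t x = 0;
    for (; x + 16 <= count; x += 16)
    {
        const int8x16_t   s8  = vld1q_s8(src + x);
        const int16x8x2_t s16 = {{vmovl_s8(vget_low_s8(s8)), vmovl_s8(vget_high_s8(s8))}};

        vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(s16.val[0]))));
        vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(s16.val[0]))));
        vst1q_f32(dst + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(s16.val[1]))));
        vst1q_f32(dst + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(s16.val[1]))));
    }
    // Compute left-over elements one by one, as run_op does.
    for (; x < count; ++x)
    {
        dst[x] = static_cast<float>(src[x]);
    }
}
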
@@ -325,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
    /* The ukernel runs only for fp16 casts, so its pointer is checked against nullptr only right before it is used */
- const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuCastKernel::get_implementation(
+ CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()});
- switch(_src->info()->data_type())
+ switch (_src->info()->data_type())
{
#ifdef __aarch64__
case DataType::U64:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F32:
{
@@ -353,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
case DataType::S64:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F32:
{
@@ -369,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::QASYMM8_SIGNED:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::S16:
{
/* Up-conversion QASYMM8_SIGNED -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion QASYMM8_SIGNED -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion QASYMM8_SIGNED -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F16:
@@ -492,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::QASYMM8:
case DataType::U8:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::S16:
{
/* Up-conversion U8 -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion U8 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion U8 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F16:
@@ -609,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U16:
{
/* Up-conversion U8 -> U16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
- const uint16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_u8(vget_low_u8(texels_u8)),
- vmovl_u8(vget_high_u8(texels_u8))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_u16(dst_ptr + x, texels.val[0]);
- vst1q_u16(dst_ptr + x + 8, texels.val[1]);
- }
+ const uint16x8x2_t texels = {
+ {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_u16(dst_ptr + x, texels.val[0]);
+ vst1q_u16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -647,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
case DataType::S16:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion S16 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
- }
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
- }
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::U8:
{
/* Down-conversion S16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
- vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
- }
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+ vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::S32:
{
/* Up-conversion S16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int32x4x4_t texels_s32 =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s16(vget_low_s16(texels.val[0])),
- vmovl_s16(vget_high_s16(texels.val[0])),
- vmovl_s16(vget_low_s16(texels.val[1])),
- vmovl_s16(vget_high_s16(texels.val[1]))
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s32(dst_ptr + x, texels_s32.val[0]);
- vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
- vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
- vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
- }
+ const int32x4x4_t texels_s32 = {
+ {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
+ vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, texels_s32.val[0]);
+ vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
+ vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
+ vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -828,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U16:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::U8:
{
/* Down-conversion U16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+ }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::U32:
{
/* Up-conversion U16 -> U32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
- vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- },
- src, dst);
+ vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
+ vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -941,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
break;
}
case DataType::F32:
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F16:
{
@@ -953,105 +858,110 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::S32:
{
/* Conversion F32 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
+ }};
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
- }
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
/* Down-conversion F32 -> U8 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
- }
+ }};
+
+ vst1_u8(dst_ptr + x,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion F32 -> QASYMM8_SIGNED */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ }};
+
+ vst1_s8(dst_ptr + x,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_s8(dst_ptr + x + 8,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
@@ -1060,7 +970,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
break;
case DataType::S32:
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
#if __aarch64__
case DataType::S64:
@@ -1079,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::F32:
{
/* Conversion S32 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x4_t texels = {{
vld1q_s32(src_ptr + x),
vld1q_s32(src_ptr + x + 4),
vld1q_s32(src_ptr + x + 8),
vld1q_s32(src_ptr + x + 12),
- }
- };
+ }};
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
- }
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion S32 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x4_t texels = {{
vld1q_s32(src_ptr + x),
vld1q_s32(src_ptr + x + 4),
vld1q_s32(src_ptr + x + 8),
vld1q_s32(src_ptr + x + 12),
- }
- };
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
- }
+ }};
+ vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
+ vqmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
+ vqmovn_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
- vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
+ vmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
+ vmovn_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
@@ -1184,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U8:
{
/* Down-conversion S32 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
- }
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+ vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
+ vqmovun_s32(texels.val[1]))));
+ vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
+ vqmovun_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
- vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_u8(dst_ptr + x,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
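For reference, the loops reformatted above all follow the same pattern: process 16 elements per iteration with NEON, then finish the row with a scalar tail. A minimal standalone sketch of the S32 -> U8 down-conversion in both its saturating and wrapping forms follows; it assumes a NEON-capable target, and the function name s32_to_u8 is illustrative rather than part of the library.

// Minimal sketch: saturating vs. wrapping S32 -> U8 narrowing with a scalar tail.
#include <arm_neon.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>

void s32_to_u8(const int32_t *src, uint8_t *dst, size_t n, bool saturate)
{
    size_t x = 0;
    for (; x + 16 <= n; x += 16)
    {
        const int32x4x4_t texels = {{vld1q_s32(src + x), vld1q_s32(src + x + 4),
                                     vld1q_s32(src + x + 8), vld1q_s32(src + x + 12)}};
        if (saturate)
        {
            // vqmovun_s32 clamps to [0, 65535]; vqmovn_u16 then clamps to [0, 255].
            vst1_u8(dst + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
                                                     vqmovun_s32(texels.val[1]))));
            vst1_u8(dst + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
                                                         vqmovun_s32(texels.val[3]))));
        }
        else
        {
            // vmovn_* simply drops the high bits, i.e. wrap-around semantics.
            vst1_u8(dst + x,
                    vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
                                           vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
            vst1_u8(dst + x + 8,
                    vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
                                           vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
        }
    }
    // Leftover elements, handled one at a time as in the kernels above.
    for (; x < n; ++x)
    {
        const int32_t v = src[x];
        dst[x] = saturate ? static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(v, 0), 255))
                          : static_cast<uint8_t>(v);
    }
}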
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index a7e6417ff2..ddbfe1f034 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -40,7 +40,8 @@ namespace kernels
class CpuCastKernel : public ICpuKernel<CpuCastKernel>
{
private:
- using CastKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
+ using CastKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
public:
CpuCastKernel() = default;
@@ -76,7 +77,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct CastKernel
@@ -89,7 +90,7 @@ public:
static const std::vector<CastKernel> &get_available_kernels();
private:
- ConvertPolicy _policy{ ConvertPolicy::SATURATE };
+ ConvertPolicy _policy{ConvertPolicy::SATURATE};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp
index bf5a44d78b..a52a1f58ea 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.cpp
+++ b/src/cpu/kernels/CpuCol2ImKernel.cpp
@@ -29,8 +29,9 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,9 +50,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
// Validate configured output
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_col2im_shape(*src, convolved_dims, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -106,13 +108,16 @@ void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const T
Iterator in(src, window);
Iterator out(dst, window_out);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int hidx = id.y();
- const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x;
- std::memcpy(out.ptr() + idx, in.ptr(), el_size);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int hidx = id.y();
+ const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y +
+ (hidx % _convolved_dims.width) * output_stride_x;
+ std::memcpy(out.ptr() + idx, in.ptr(), el_size);
+ },
+ in, out);
}
const char *CpuCol2ImKernel::name() const
@@ -121,4 +126,4 @@ const char *CpuCol2ImKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
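For context, the col2im loop reformatted above computes a destination byte offset by splitting the column index hidx into a row and a column of the convolved output. A small sketch of that mapping, with W standing for _convolved_dims.width and the stride arguments standing for the output strides in bytes (names are illustrative):

// Sketch of the col2im destination offset computed in run_op above.
#include <cstddef>

size_t col2im_offset(size_t x, size_t hidx, size_t W,
                     size_t stride_x, size_t stride_y, size_t stride_z)
{
    const size_t row = hidx / W; // vertical position in the convolved plane
    const size_t col = hidx % W; // horizontal position in the convolved plane
    return x * stride_z + row * stride_y + col * stride_x;
}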
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
index deafcc14df..3e394ac914 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.h
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -75,7 +76,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
index 29d40f0e52..8c290173e8 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
@@ -30,10 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -50,13 +51,14 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c
uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
// Offset dst
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ batch_offset * dst->info()->strides_in_bytes()[3];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / dst->info()->element_size();
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
@@ -66,66 +68,74 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr,
+ vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
@@ -154,7 +164,7 @@ void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int b
_func = nullptr;
_batch_offset = batch_offset;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S8:
case DataType::U8:
@@ -196,9 +206,7 @@ void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &windo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _batch_offset,
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset,
window);
}
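For reference, the quantized concatenation paths above requantize each element when the source and destination quantization infos differ. A scalar sketch of that round trip follows, assuming the usual affine scheme real = scale * (q - offset); the vector paths perform the same computation 16 lanes at a time via vdequantize/vquantize, and the exact rounding mode of the library helper may differ from this sketch.

// Scalar sketch of QASYMM8 requantization between two (scale, offset) pairs.
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t requantize_qasymm8(uint8_t q, float src_scale, int32_t src_offset,
                           float dst_scale, int32_t dst_offset)
{
    // Dequantize with the source parameters.
    const float real = src_scale * (static_cast<int32_t>(q) - src_offset);
    // Quantize with the destination parameters, rounding and clamping to [0, 255].
    const int32_t requantized = static_cast<int32_t>(std::lround(real / dst_scale)) + dst_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, requantized)));
}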
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
index 0de68a5d64..52ea553a7d 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.h
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -57,15 +57,15 @@ public:
static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
private:
- BatchConcatFunction *_func{ nullptr };
- unsigned int _batch_offset{ 0 };
+ BatchConcatFunction *_func{nullptr};
+ unsigned int _batch_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
index ebc5322aee..c75e1e4477 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
@@ -30,11 +30,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <cstdint>
@@ -53,13 +54,14 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c
uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
// Offset destination
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ depth_offset * dst->info()->strides_in_bytes()[2];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / dst->info()->element_size();
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
@@ -69,64 +71,73 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
@@ -134,7 +145,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
@@ -154,7 +166,7 @@ void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int d
_func = nullptr;
_depth_offset = depth_offset;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &depth_concat<uint8_t>;
@@ -192,9 +204,7 @@ void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &windo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _depth_offset,
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset,
window);
}
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
index 5a0edb95bb..54de9aff46 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -65,15 +65,15 @@ public:
static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
private:
- DepthConcatFunction *_func{ nullptr };
- unsigned int _depth_offset{ 0 };
+ DepthConcatFunction *_func{nullptr};
+ unsigned int _depth_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
index 47a2b44443..b6c11d948b 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
@@ -30,10 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <cstdint>
@@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -91,13 +92,14 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind
auto dst = tensors.get_tensor(TensorType::ACL_DST);
// Offset destination pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
const int window_step_x = 16;
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
@@ -108,64 +110,74 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
-
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
index 74d5d0c2c3..df880c4878 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.h
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -58,11 +58,11 @@ public:
static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _height_offset{ 0 };
+ unsigned int _height_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
index f00b37a01b..f6100cccca 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
@@ -24,12 +24,12 @@
#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Steps.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Steps.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
namespace arm_compute
{
@@ -47,7 +47,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -86,13 +86,14 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo
auto dst = tensors.get_tensor(TensorType::ACL_DST);
// Offset output pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _width_offset * dst->info()->strides_in_bytes()[0];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
constexpr int window_step_x = 16;
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
// Create iterators
@@ -101,62 +102,73 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
index 418bc51b33..560e44e35a 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -58,11 +58,11 @@ public:
static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _width_offset{ 0 };
+ unsigned int _width_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
index 08b39deef2..87703ec631 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -34,8 +35,10 @@ namespace cpu
{
namespace kernels
{
-void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -43,7 +46,8 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT
// Output tensor auto initialisation if not yet initialized
auto_init_if_empty(*dst, *src->clone());
- ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
@@ -62,8 +66,10 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT
ICpuKernel::configure(win);
}
-Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
@@ -72,7 +78,7 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
// Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -97,11 +103,15 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W
Iterator input(src, window);
Iterator output(dst, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size);
- },
- input);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ memcpy(output.ptr() + id.x() * dst_stride_x +
+ (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y,
+ input.ptr(), element_size);
+ },
+ input);
}
const char *CpuConvertFullyConnectedWeightsKernel::name() const
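For context, the memcpy in the loop above scatters each source row into the destination according to a fixed permutation: row y lands at row (y % _factor1) * _factor2 + y / _factor1, which is what re-orders fully connected weights between NCHW- and NHWC-trained layouts. A small sketch of that index mapping, with factor1/factor2 standing in for the kernel's members (one is the number of elements per original input plane, the other the channel count, depending on the layout):

// Sketch of the fully connected weight row remapping; illustrative only.
#include <cstddef>

size_t remap_weight_row(size_t y, size_t factor1, size_t factor2)
{
    return (y % factor1) * factor2 + y / factor1;
}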
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 9a1393323b..2253889e69 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -53,24 +53,32 @@ public:
* @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
- unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
+ unsigned int _factor1{
+ 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
+ unsigned int _factor2{
+ 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
index 1005d001ab..745b1566c2 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
@@ -29,9 +29,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
// Validate output if initialized
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
@@ -60,11 +61,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
{
 // Output auto initialization if not yet initialized
{
- const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
- const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
const int offset_correction = is_input_signed ? -128 : 128;
- const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
}
@@ -110,27 +111,29 @@ void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Win
const uint8_t mask = 128;
const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
- *(output_ptr + x) = in ^ mask;
- }
- },
- input, output);
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
}
const char *CpuConvertQuantizedSignednessKernel::name() const
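The loop above flips the most significant bit of every byte (veor with the 128 mask), which converts between the QASYMM8 and QASYMM8_SIGNED encodings; the ±128 offset correction applied to the quantization info earlier in this file keeps the represented real values unchanged. A minimal scalar sketch of why the two views agree (not part of the patch; the scale and offsets are made up):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float   scale           = 0.5f;
        const int32_t unsigned_offset = 130;                   // hypothetical QASYMM8 zero point
        const int32_t signed_offset   = unsigned_offset - 128; // corrected QASYMM8_SIGNED zero point

        const uint8_t qu8 = 140;                              // a quantized sample
        const int8_t  qs8 = static_cast<int8_t>(qu8 ^ 0x80U); // same bits with the sign bit flipped -> 12

        // Both encodings decode to the same real value.
        const float real_from_u = scale * (static_cast<int32_t>(qu8) - unsigned_offset);
        const float real_from_s = scale * (static_cast<int32_t>(qs8) - signed_offset);

        std::printf("%f %f\n", real_from_u, real_from_s); // prints 5.000000 5.000000
        return 0;
    }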
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
index b5eaf65487..e94d3d5ef2 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp
index 3f0f3fe422..1b693d7a3a 100644
--- a/src/cpu/kernels/CpuCopyKernel.cpp
+++ b/src/cpu/kernels/CpuCopyKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,9 +49,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
// Validate destination if initialized
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -64,7 +66,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
return std::make_pair(Status{}, calculate_max_window(*dst));
}
-std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
+std::pair<Status, Window>
+validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
{
const TensorShape src_shape = src->tensor_shape();
const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
@@ -84,7 +87,7 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa
_padding = padding;
std::pair<Status, Window> win_config;
- if(padding.empty())
+ if (padding.empty())
{
win_config = validate_and_configure_window(src, dst);
}
@@ -97,17 +100,20 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa
ICpuKernel::configure(win_config.second);
}
-Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding)
+Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src,
+ const arm_compute::ITensorInfo *dst,
+ const PaddingList &padding)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
- if(padding.empty())
+ if (padding.empty())
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
}
return Status{};
@@ -122,38 +128,41 @@ void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_padding.empty())
+ if (_padding.empty())
{
- Window dst_window{ window };
- dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
+ Window dst_window{window};
+ dst_window.set(Window::DimX,
+ Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
Window out_slice = dst_window.first_slice_window_1D();
do
{
Iterator src_it(src, out_slice);
Iterator dst_it(dst, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size());
- },
- src_it, dst_it);
- }
- while(dst_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &)
+ { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); },
+ src_it, dst_it);
+ } while (dst_window.slide_window_slice_1D(out_slice));
}
else
{
- Window src_window{ window };
- src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
+ Window src_window{window};
+ src_window.set(Window::DimX,
+ Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
Iterator src_it(src, src_window);
Iterator dst_it(dst, window);
const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
- execute_window_loop(window, [&](const Coordinates &)
- {
- auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
- std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
+ std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
+ },
+ src_it, dst_it);
}
}
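In the padded branch above, each source row is copied into the destination row shifted right by _padding[0].first elements. A small standalone sketch of the same idea on plain vectors (not part of the patch; buffer sizes are hypothetical):

    #include <cstddef>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t row_elems = 4;
        const std::size_t pad_left  = 2;
        const std::size_t pad_right = 1;

        const std::vector<float> src_row = {1.f, 2.f, 3.f, 4.f};
        std::vector<float>       dst_row(pad_left + row_elems + pad_right, 0.f); // zero-filled padded row

        // Mirrors: dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size()
        std::memcpy(dst_row.data() + pad_left, src_row.data(), row_elems * sizeof(float));

        for (float v : dst_row)
        {
            std::cout << v << ' '; // 0 0 1 2 3 4 0
        }
        std::cout << '\n';
        return 0;
    }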
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
index c9ef8eba76..a05053f07e 100644
--- a/src/cpu/kernels/CpuCopyKernel.h
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -55,7 +55,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index d6c56d2012..82e3a5ce00 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/traits.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
#include "src/cpu/kernels/depthwiseconv2d/list.h"
namespace arm_compute
@@ -41,72 +42,53 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels =
-{
- {
- "neon_qu8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QASYMM8);
- },
- REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)
- },
- {
- "neon_qs8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QASYMM8_SIGNED);
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)
- },
- {
- "neon_fp16_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::F16 && data.isa.fp16);
- },
- REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)
- },
- {
- "neon_fp32_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)
- },
- {
- "neon_qp8_qu8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8);
- },
- REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)
- },
- {
- "neon_qp8_qs8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8);
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)
- },
+static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = {
+ {"neon_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)},
+ {"neon_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)},
+ {"neon_fp16_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)},
+ {"neon_fp32_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)},
+ {"neon_qp8_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)},
+ {"neon_qp8_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) >
+ src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) >
+ src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) ||
+ (info.pad_stride_info.stride().second < 1));
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -116,12 +98,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -131,9 +113,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -142,7 +125,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
} // namespace
-void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
@@ -151,18 +138,26 @@ void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITe
_conv_info = info;
const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation(
- DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() });
+ DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_func = uk->ukernel;
const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
+ auto_init_if_empty(*dst, src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(output_shape)
+ .set_quantization_info(dst->quantization_info()));
Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
}
-Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
return Status{};
@@ -187,7 +182,8 @@ const char *CpuDepthwiseConv2dNativeKernel::name() const
return "CpuDepthwiseConv2dNativeKernel";
}
-const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &CpuDepthwiseConv2dNativeKernel::get_available_kernels()
+const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &
+CpuDepthwiseConv2dNativeKernel::get_available_kernels()
{
return available_kernels;
}
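validate_arguments above compares the dilated ("effective") kernel extent, k + (k - 1) * (d - 1), against the padded input size. A tiny standalone sketch of that arithmetic (not part of the patch; the numbers are made up):

    #include <cstdio>

    int main()
    {
        // Hypothetical 3-wide kernel with dilation 2 on a 5-wide input padded by 1 on each side.
        const int kernel_w   = 3;
        const int dilation_x = 2;
        const int src_w      = 5;
        const int pad_left   = 1;
        const int pad_right  = 1;

        // Same expression as the validation check: k + (k - 1) * (d - 1)
        const int effective_kernel_w = kernel_w + (kernel_w - 1) * (dilation_x - 1);

        std::printf("effective kernel width: %d\n", effective_kernel_w); // 5
        std::printf("fits in padded input:   %s\n",
                    effective_kernel_w <= src_w + pad_left + pad_right ? "yes" : "no"); // yes
        return 0;
    }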
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 9fabd0b01c..7e78f52e13 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
#include "support/AclRequires.h"
@@ -44,8 +45,9 @@ namespace kernels
class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
{
private:
- using DepthwiseConv2dNativeKernelPtr =
- std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::type;
+ using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::
+ type;
public:
CpuDepthwiseConv2dNativeKernel() = default;
@@ -64,17 +66,25 @@ public:
* @param[in] info Depthwise convolution meta-data.
*
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DepthwiseConv2dNativeKernel
{
@@ -89,9 +99,9 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- DepthwiseConv2dNativeKernelPtr _func{ nullptr };
+ DepthwiseConv2dNativeKernelPtr _func{nullptr};
ConvolutionInfo _conv_info{};
- bool _has_biases{ false };
+ bool _has_biases{false};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp
index a2d24f9243..d17128b5ac 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.cpp
+++ b/src/cpu/kernels/CpuDequantizeKernel.cpp
@@ -28,12 +28,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
@@ -48,9 +49,11 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
- if(dst->tensor_shape().total_size() > 0)
+ if (dst->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -124,28 +127,30 @@ void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Win
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale, offset);
+ const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale, offset);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
- }
- },
- in, out);
+ store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -165,28 +170,30 @@ void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *o
Iterator in(input, win);
Iterator out(output, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale[id.z()]);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale[id.z()]);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -206,37 +213,34 @@ void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *o
Iterator in(input, win);
Iterator out(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t vscale =
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
- scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
- scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
- scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
- }
- };
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, vscale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
- }
- },
- in, out);
+ const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4],
+ scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9],
+ scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13],
+ scale[x + 14], scale[x + 15]}};
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, vscale);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -257,28 +261,30 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -299,34 +305,36 @@ void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Win
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize_int16(vin, scale);
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize_int16(vin, scale);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int16_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int16_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
+ }
+ },
+ in, out);
}
template <typename T>
void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
{
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
run_dequantization_qasymm8<T, uint8_t>(input, output, window);
@@ -335,7 +343,9 @@ void run_dequantization_core(const ITensor *input, ITensor *output, const Window
run_dequantization_qasymm8<T, int8_t>(input, output, window);
break;
case DataType::QSYMM8_PER_CHANNEL:
- input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
+ input->info()->data_layout() == DataLayout::NHWC
+ ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window)
+ : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
break;
case DataType::QSYMM8:
run_dequantization_qsymm8<T>(input, output, window);
@@ -377,7 +387,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(dst->info()->data_type())
+ switch (dst->info()->data_type())
{
case DataType::F32:
run_dequantization_core<float>(src, dst, window);
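The leftover loops in the reformatted functions above fall back to scalar dequantization: asymmetric types compute scale * (q - offset), symmetric types compute scale * q, and the per-channel variant picks one scale per channel. A minimal scalar sketch (not part of the patch; the quantization parameters are made up):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }

    float dequantize_qsymm8(int8_t q, float scale)
    {
        return scale * static_cast<float>(q);
    }

    int main()
    {
        // Hypothetical quantization parameters.
        const float   scale  = 0.1f;
        const int32_t offset = 128;

        std::printf("%f\n", dequantize_qasymm8(131, scale, offset)); // approximately 0.300000
        std::printf("%f\n", dequantize_qsymm8(-25, scale));          // -2.500000

        // QSYMM8_PER_CHANNEL: one scale per channel, chosen by the channel coordinate.
        const std::vector<float> per_channel_scale = {0.1f, 0.2f};
        std::printf("%f\n", dequantize_qsymm8(10, per_channel_scale[1])); // 2.000000
        return 0;
    }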
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
index cfa991dc74..6ed58587c9 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.h
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index a4cdddee5e..4cb0fb1c40 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
-#include "src/cpu/kernels/directconv2d/list.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/directconv2d/list.h"
using namespace arm_compute::detail;
@@ -38,26 +39,25 @@ namespace cpu
{
namespace kernels
{
-static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels =
-{
- {
- "neon_fp32_nhwc_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
- REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)
- },
- {
- "neon_fp32_nchw_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
- REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)
- },
- {
- "neon_fp16_nchw_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)
- },
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = {
+ {"neon_fp32_nhwc_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)},
+ {"neon_fp32_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)},
+ {"neon_fp16_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
@@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
ARM_COMPUTE_UNUSED(width_idx);
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
@@ -100,11 +100,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
// Configure window without any padding
win = calculate_max_window(*dst, Steps());
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
-void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -129,12 +133,13 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT
ICpuKernel::configure(win_config.second);
}
-Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
- dst->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -149,7 +154,8 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() });
+ const auto *uk = CpuDirectConv2dKernel::get_implementation(
+ DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
uk->ukernel(window, src, weights, dst, _conv_info);
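available_kernels and get_implementation follow the same table-driven dispatch as the other kernels touched by this patch: each entry pairs a name, a selector predicate, and a function pointer, and the first entry whose predicate matches is run. A simplified standalone sketch of that pattern (not the library's implementation; all types and names here are hypothetical):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    enum class DT { F32, F16, QASYMM8 };

    struct Query
    {
        DT   dt;
        bool fp16_supported;
    };

    struct MicroKernel
    {
        std::string                        name;
        std::function<bool(const Query &)> is_selected;
        std::function<void()>              run;
    };

    // Returns the first entry whose predicate accepts the query, or nullptr.
    const MicroKernel *select(const std::vector<MicroKernel> &table, const Query &q)
    {
        for (const auto &k : table)
        {
            if (k.is_selected(q))
            {
                return &k;
            }
        }
        return nullptr;
    }

    int main()
    {
        const std::vector<MicroKernel> table = {
            {"fp16_kernel", [](const Query &q) { return q.dt == DT::F16 && q.fp16_supported; },
             [] { std::cout << "fp16 path\n"; }},
            {"fp32_kernel", [](const Query &q) { return q.dt == DT::F32; }, [] { std::cout << "fp32 path\n"; }},
        };

        const MicroKernel *uk = select(table, Query{DT::F32, false});
        if (uk != nullptr)
        {
            uk->run(); // prints "fp32 path"
        }
        return 0;
    }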
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index b9265dc630..ad4caea193 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
{
private:
- using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+ using DirectConv2dKernel_Ptr = std::add_pointer<void(
+ const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
public:
CpuDirectConv2dKernel() = default;
@@ -64,10 +65,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DirectConv2dKernel
@@ -81,8 +85,8 @@ public:
private:
PadStrideInfo _conv_info{};
- unsigned int _kernel_size{ 0 };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
+ unsigned int _kernel_size{0};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 93ad5e5eba..d4af8bedaf 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -27,15 +27,16 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
#include <cstddef>
@@ -49,7 +50,9 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -57,22 +60,23 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(
+ src->data_layout(), DataLayoutDimension::CHANNEL)));
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
- if(src->data_type() == DataType::S32)
+ if (src->data_type() == DataType::S32)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
}
// Checks performed when output is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
- if(is_data_type_float(src->data_type()))
+ if (is_data_type_float(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -82,10 +86,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
- else if(src->data_type() == DataType::S32)
+ else if (src->data_type() == DataType::S32)
{
// In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
- ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) &&
+ (info.output_data_type != DataType::QASYMM8_SIGNED));
}
return Status{};
@@ -93,8 +98,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
/** SIMD vector tag type. */
@@ -113,50 +123,57 @@ output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITens
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
- auto v_in = wrapper::vloadq(in_ptr);
-
- // Accumulate bias
- if(has_bias)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
- }
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+ auto v_in = wrapper::vloadq(in_ptr);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, v_in);
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+ v_in = wrapper::vadd(v_in, vb);
+ }
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, v_in);
+ }
- // Accumulate bias
- if(has_bias)
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
- *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
- },
- in, out);
+ *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
+ }
+ },
+ in, out);
}
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -179,50 +196,59 @@ output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITens
Iterator bi(bias, window_bias);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr + x);
-
- // Accumulate bias
- if(has_bias)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
- }
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+ auto v_in = wrapper::vloadq(in_ptr + x);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr + x, v_in);
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ }
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ wrapper::vstore(out_ptr + x, v_in);
+ }
- // Accumulate bias
- if(has_bias)
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- *(out_ptr + x) = s_in;
- }
- },
- in, bi, out);
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ *(out_ptr + x) = s_in;
+ }
+ },
+ in, bi, out);
}
// Quantized case
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -242,67 +268,63 @@ void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12)}};
+
+ // Accumulate bias
+ if (has_bias)
{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+ v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
+ wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
}
- };
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in =
- {
- {
- wrapper::vadd(v_in.val[0], vb),
- wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb),
- wrapper::vadd(v_in.val[3], vb)
- }
- };
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
}
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
- min, max, false));
- }
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
}
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, out);
+ },
+ in, out);
}
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -329,62 +351,65 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window,
Iterator bi(bias, window_bias);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12),
+ }};
+
+ // Accumulate bias
+ if (has_bias)
{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
- }
- };
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+ wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+ wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+ wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ }
- wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
}
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
- }
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32_t s_in = *in_ptr;
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32_t s_in = *in_ptr;
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- s_in += *bias_ptr;
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
}
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, bi, out);
+ },
+ in, bi, out);
}
} // namespace
-void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_UNUSED(bias);
@@ -398,7 +423,7 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
_result_offset_after_shift = info.result_offset_after_shift;
    // Auto-initialize output if required
- if(dst != nullptr)
+ if (dst != nullptr)
{
// Work out expected output data type
const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
@@ -410,16 +435,17 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
ICpuKernel::configure(win);
- const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
+ const bool is_qasymm8_signed =
+ (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
// Set appropriate function
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S32:
{
- if(is_qasymm8_signed)
+ if (is_qasymm8_signed)
{
_func = &output_stage_nchw<int8_t>;
}
@@ -449,11 +475,11 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
}
else
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S32:
{
- if(is_qasymm8_signed)
+ if (is_qasymm8_signed)
{
_func = &output_stage_nhwc<int8_t>;
}
@@ -483,7 +509,9 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
}
}
-Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
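For readers following the requantization in the NHWC loop above: finalize_quantization() applies the usual gemmlowp-style fixed-point output stage to each S32 accumulator. Below is a scalar sketch of that arithmetic under assumed semantics (the helper name requantize_s32_to_qasymm8 is hypothetical; rounding of negative values is simplified relative to the saturating NEON helper, which also covers the QASYMM8_SIGNED range):

#include <algorithm>
#include <cstdint>

// Scalar sketch of the S32 -> QASYMM8 output stage: rounding-doubling high multiply,
// rounding shift right, add offset, clamp to the 8-bit range.
uint8_t requantize_s32_to_qasymm8(int32_t acc, int32_t multiplier, int shift, int32_t offset_after_shift)
{
    // Fixed-point multiply: (acc * multiplier + 2^30) >> 31, kept in 64 bits to avoid overflow.
    int64_t v = (static_cast<int64_t>(acc) * multiplier + (int64_t(1) << 30)) >> 31;
    if (shift > 0)
    {
        // Rounding shift right (negative inputs round slightly differently in the NEON helper).
        v = (v + (int64_t(1) << (shift - 1))) >> shift;
    }
    v += offset_after_shift;
    // Clamp to the unsigned 8-bit output range.
    return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v)));
}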
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index d3ef17b7c9..ce84f49cf6 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -55,29 +56,40 @@ public:
* Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
* @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
*/
- void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ void
+ configure(ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv2dOutputStageKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ static Status
+ validate(const ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
+ using OutputStageKernel = void(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift);
- OutputStageKernel *_func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
+ OutputStageKernel *_func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
};
} // namespace kernels
} // namespace cpu
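A minimal configure/validate sketch against the declarations above; the helper name, tensor shapes, layout and requantization parameters are illustrative assumptions, not values taken from this patch.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"

using namespace arm_compute;

void example_output_stage_setup()
{
    // S32 accumulators in NHWC, to be requantized to QASYMM8 (hypothetical sizes/params).
    TensorInfo acc(TensorShape(32U, 7U, 7U), 1, DataType::S32);
    acc.set_data_layout(DataLayout::NHWC);
    TensorInfo bias(TensorShape(32U), 1, DataType::S32);
    TensorInfo dst; // auto-initialized by configure() from src + info

    DirectConvolutionLayerOutputStageKernelInfo info{};
    info.result_fixedpoint_multiplier = 1073741824;
    info.result_shift                 = 8;
    info.result_offset_after_shift    = 128;
    info.output_data_type             = DataType::QASYMM8;

    cpu::kernels::CpuDirectConv2dOutputStageKernel kernel;
    if (cpu::kernels::CpuDirectConv2dOutputStageKernel::validate(&acc, &bias, &dst, info).error_code() ==
        ErrorCode::OK)
    {
        kernel.configure(&acc, &bias, &dst, info);
    }
}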
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
index 22c60cd994..b5b2aed1ba 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/conv3d/neon/list.h"
#include <algorithm>
@@ -49,43 +50,37 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels =
-{
+static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)
- },
+ {"neon_fp16_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_fp32_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)
- },
- {
- "neon_qasymm8_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)
- },
- {
- "neon_qasymm8_signed_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)
- }
-};
-
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+ {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)},
+ {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)},
+ {"neon_qasymm8_signed_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}};
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U));
- const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -96,9 +91,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5);
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx));
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
- if(is_data_type_quantized(src0->data_type()))
+ if (is_data_type_quantized(src0->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
}
@@ -106,14 +101,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+ "Biases size and number of dst feature maps should match");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
}
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
DataType data_type = src0->data_type();
@@ -125,12 +122,17 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
} // namespace
-void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info)
+void CpuDirectConv3dKernel::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_UNUSED(src2);
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -139,7 +141,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo
_name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name);
// Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
DataType data_type = src0->data_type();
@@ -154,7 +157,11 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info));
@@ -188,4 +195,4 @@ const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKer
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
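The available_kernels tables in these files all follow one pattern: a name, a selector predicate over data type and ISA, and a registered function pointer; get_implementation() walks the table and returns the first match. A stand-alone sketch of that lookup, with illustrative types and names rather than the library's own:

#include <functional>
#include <iostream>
#include <vector>

// Minimal stand-ins; the real code uses DataTypeISASelectorData and CPUInfo.
enum class DT { F16, F32, QASYMM8 };
struct Isa { bool fp16 = false; bool sve = false; };
struct SelectorData { DT dt; Isa isa; };

struct MicroKernel
{
    const char                               *name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

// First match wins, so more specialised entries (fp16, sve) are listed first.
const MicroKernel *get_implementation(const std::vector<MicroKernel> &table, const SelectorData &sel)
{
    for (const auto &k : table)
    {
        if (k.is_selected(sel))
        {
            return &k;
        }
    }
    return nullptr;
}

void run_fp16() { std::cout << "fp16 kernel\n"; }
void run_fp32() { std::cout << "fp32 kernel\n"; }

int main()
{
    const std::vector<MicroKernel> table = {
        {"neon_fp16_demo", [](const SelectorData &d) { return d.dt == DT::F16 && d.isa.fp16; }, run_fp16},
        {"neon_fp32_demo", [](const SelectorData &d) { return d.dt == DT::F32; }, run_fp32},
    };

    const auto *uk = get_implementation(table, SelectorData{DT::F32, Isa{}});
    if (uk != nullptr && uk->ukernel != nullptr)
    {
        uk->ukernel(); // prints "fp32 kernel"
    }
    return 0;
}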
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
index 688f368b9f..8e6f564679 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,7 +40,8 @@ class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel>
{
private:
/* Template function for convolution 3d NDHWC */
- using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+ using DirectConv3dKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
public:
CpuDirectConv3dKernel() = default;
@@ -63,17 +65,25 @@ public:
 * @param[in]  conv_info Contains padding, stride, activation information.
*
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info);
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv3dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DirectConv3dKernel
@@ -87,7 +97,7 @@ public:
private:
Conv3dInfo _conv_info{};
- DirectConv3dKernelPtr _run_method{ nullptr };
+ DirectConv3dKernelPtr _run_method{nullptr};
std::string _name{};
};
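To see which of the entries above made it into a given build, the kernel exposes its table; a short sketch (assuming the DirectConv3dKernel entry keeps a public name field, as the other kernel tables in this patch do; list_conv3d_kernels is an illustrative helper name):

#include <iostream>

#include "src/cpu/kernels/CpuDirectConv3dKernel.h"

// Print the direct-conv3d micro-kernels compiled into this build; which entries
// exist depends on the FP16/QASYMM8 registration macros and the target ISA.
void list_conv3d_kernels()
{
    for (const auto &k : arm_compute::cpu::kernels::CpuDirectConv3dKernel::get_available_kernels())
    {
        std::cout << k.name << "\n";
    }
}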
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index a045855b1a..57a3f39822 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -24,8 +24,9 @@
#include "src/cpu/kernels/CpuElementwiseKernel.h"
#include "arm_compute/core/Helpers.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/elementwise_binary/list.h"
@@ -35,11 +36,11 @@
#if defined(ENABLE_FP32_KERNELS)
namespace
{
- static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
- static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
- static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
- static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
-}
+static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+} // namespace
#endif /* ENABLE_FP32_KERNELS */
namespace arm_compute
@@ -50,255 +51,178 @@ namespace kernels
{
namespace
{
-template <ArithmeticOperation op>
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic =
-{
- {
- "sve2_qu8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)
- },
- {
- "sve2_qs8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)
- },
- {
- "sve_fp32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)
- },
- {
- "sve_s32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)
- },
- {
- "sve_s16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)
- },
- {
- "sve_fp16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)
- },
- {
- "neon_fp32_arithmetic",
-
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)
- },
- {
- "neon_s32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)
- },
- {
- "neon_fp16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)
- },
- {
- "neon_s16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)
- },
- {
- "neon_qu8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)
- },
- {
- "neon_qs8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)
- },
+template <ArithmeticOperation op>
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = {
+ {"sve2_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)},
+ {"sve2_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)},
+ {"sve_fp32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)},
+ {"sve_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)},
+ {"sve_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)},
+ {"sve_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)},
+ {"neon_fp32_arithmetic",
+
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)},
+ {"neon_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)},
+ {"neon_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)},
+ {"neon_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)},
+ {"neon_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)},
+ {"neon_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)},
};
-template <ComparisonOperation op>
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison =
-{
- {
- "sve2_qu8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)
- },
- {
- "sve2_qs8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)
- },
- {
- "sve_u8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)
- },
- {
- "sve_fp32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)
- },
- {
- "sve_s16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)
- },
- {
- "sve_s32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)
- },
- {
- "sve_fp16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)
- },
- {
- "neon_u8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)
- },
- {
- "neon_fp32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)
- },
- {
- "neon_s16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)
- },
- {
- "neon_s32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)
- },
- {
- "neon_qu8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)
- },
- {
- "neon_qs8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)
- },
- {
- "neon_fp16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)
- },
+template <ComparisonOperation op>
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = {
+ {"sve2_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)},
+ {"sve2_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"sve_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)},
+ {"sve_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)},
+ {"sve_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)},
+ {"sve_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)},
+ {"sve_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)},
+ {"neon_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)},
+ {"neon_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)},
+ {"neon_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)},
+ {"neon_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)},
+ {"neon_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)},
+ {"neon_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"neon_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)},
};
} // namespace
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &
+CpuArithmeticKernel::get_available_kernels()
{
static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels;
- std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(),
+ std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
return available_kernels;
}
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &
+CpuComparisonKernel::get_available_kernels()
{
static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels;
- std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), available_kernels_comperison<ComparisonOperation::LessEqual>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(),
+ available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(),
+ available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(),
+ std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(),
+ available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::LessEqual>.end(),
+ std::back_inserter(available_kernels));
return available_kernels;
}
template <class Derived>
-Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
@@ -308,7 +232,7 @@ Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
"Wrong shape for output");
@@ -321,7 +245,8 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+ const auto *uk = CpuArithmeticKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -329,7 +254,7 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso
_name = std::string("CpuArithmeticKernel").append("/").append(uk->name);
// If any of shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
+ if (src0->is_dynamic() || src1->is_dynamic())
{
return;
}
@@ -343,7 +268,8 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+ const auto *uk = CpuComparisonKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -351,7 +277,7 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso
_name = std::string("CpuComparisonKernel").append("/").append(uk->name);
// If any of shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
+ if (src0->is_dynamic() || src1->is_dynamic())
{
return;
}
@@ -373,8 +299,10 @@ void CpuElementwiseKernel<Derived>::run_op(ITensorPack &tensors, const Window &w
_run_method(src0, src1, dst, window);
}
-template void CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
-template void CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
template <class Derived>
const char *CpuElementwiseKernel<Derived>::name() const
@@ -385,7 +313,10 @@ template const char *CpuElementwiseKernel<CpuArithmeticKernel>::name() const;
template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const;
/** Arithmetic operators (min, max, squared_diff) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuArithmeticKernel::configure(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
_op = op;
@@ -394,16 +325,20 @@ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *s
Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S16, DataType::F16, DataType::S32, DataType::F32);
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
}
return validate_arguments_common(src0, src1, dst);
}
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -416,15 +351,15 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
- || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> ||
+ this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_min_max_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_min_max_mws_V1_fp32_neon;
}
@@ -434,7 +369,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -447,7 +382,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
@@ -467,14 +402,14 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_div_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_div_mws_V1_fp32_neon;
}
@@ -484,7 +419,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -497,7 +432,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
@@ -538,7 +473,10 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1
}
/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuComparisonKernel::configure(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
_op = op;
@@ -547,16 +485,21 @@ void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *s
Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16,
+ DataType::S32, DataType::F32);
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
}
return validate_arguments_common(src0, src1, dst);
}
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuComparisonKernel::validate(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
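get_available_kernels() above flattens the per-operation variable templates into one list; note that std::move over the const source tables degrades to a copy. A small stand-alone sketch of the same pattern, with illustrative names:

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

enum class Op { ADD, SUB };

// One table per operation, mirroring available_kernels_arithmetic<op> (entries are illustrative).
template <Op op>
const std::vector<std::string> demo_kernels = {
    (op == Op::ADD) ? "neon_fp32_add" : "neon_fp32_sub",
    (op == Op::ADD) ? "neon_s32_add" : "neon_s32_sub",
};

// Flatten every per-op table into one list, as get_available_kernels() does.
std::vector<std::string> all_demo_kernels()
{
    std::vector<std::string> all;
    std::copy(demo_kernels<Op::ADD>.begin(), demo_kernels<Op::ADD>.end(), std::back_inserter(all));
    std::copy(demo_kernels<Op::SUB>.begin(), demo_kernels<Op::SUB>.end(), std::back_inserter(all));
    return all;
}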
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index 634e38bf9f..1f3e613b80 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -43,7 +43,8 @@ template <class Derived>
class CpuElementwiseKernel : public ICpuKernel<Derived>
{
private:
- using ElementwiseKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+ using ElementwiseKernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
public:
CpuElementwiseKernel() = default;
@@ -72,7 +73,7 @@ protected:
static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
protected:
- ElementwiseKernelPtr _run_method{ nullptr };
+ ElementwiseKernelPtr _run_method{nullptr};
std::string _name{};
};
@@ -96,7 +97,8 @@ public:
*
* @return a status
*/
- static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+ static Status
+ validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
@@ -200,7 +202,8 @@ public:
*
* @return a status
*/
- static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+ static Status
+ validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels();
@@ -226,4 +229,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
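A minimal usage sketch for the two kernel families declared above; the helper name, shapes and broadcast pattern are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"

#include "src/cpu/kernels/CpuElementwiseKernel.h"

using namespace arm_compute;

void example_elementwise_setup()
{
    TensorInfo a(TensorShape(16U, 8U), 1, DataType::F32);
    TensorInfo b(TensorShape(16U, 1U), 1, DataType::F32); // assumed broadcast along y
    TensorInfo out;

    // Arithmetic: one kernel class, the operation is selected at configure time.
    if (cpu::kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, &a, &b, &out).error_code() ==
        ErrorCode::OK)
    {
        cpu::kernels::CpuArithmeticKernel max_kernel;
        max_kernel.configure(ArithmeticOperation::MAX, &a, &b, &out);
    }

    // Comparison kernels produce a U8 mask output.
    TensorInfo mask;
    if (cpu::kernels::CpuComparisonKernel::validate(ComparisonOperation::Greater, &a, &b, &mask).error_code() ==
        ErrorCode::OK)
    {
        cpu::kernels::CpuComparisonKernel gt_kernel;
        gt_kernel.configure(ComparisonOperation::Greater, &a, &b, &mask);
    }
}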
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 04a7f15715..88545ee756 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/elementwise_unary/list.h"
@@ -59,12 +60,13 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale;
const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale;
- for(int i = 0; i < 256; ++i)
+ for (int i = 0; i < 256; ++i)
{
- const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
- float result = 0;
+ const auto in =
+ (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
+ float result = 0;
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
result = 1 / sqrt(in);
@@ -100,7 +102,8 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
result = utility::clamp(result, dst_min_fp, dst_max_fp);
- const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi);
+ const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi))
+ : quantize_qasymm8(result, dst_qi);
lut[i] = out;
}
@@ -109,97 +112,68 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
#endif // __aarch64__
-static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels =
-{
+static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = {
{
"sve_fp32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32 && data.isa.sve);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); },
REGISTER_FP32_SVE(sve_fp32_elementwise_unary),
nullptr,
},
{
"sve_fp16_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); },
REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
nullptr,
},
{
"sve_s32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S32 && data.isa.sve);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); },
REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
nullptr,
},
{
"neon_fp32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
nullptr,
},
{
"neon_fp16_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
REGISTER_FP16_NEON(neon_fp16_elementwise_unary),
nullptr,
},
{
"neon_s32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; },
REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
nullptr,
},
#ifdef __aarch64__
{
"sve2_q8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
- },
+ [](const DataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary),
&q8_prepare_lut,
},
{
"neon_q8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED;
- },
+ [](const DataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary),
&q8_prepare_lut,
},
#else // __aarch64__
{
"neon_qasymm8_signed_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary),
nullptr,
},
{
"neon_qasymm8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary),
nullptr,
},
@@ -211,7 +185,8 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
- const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_op = op;
@@ -219,12 +194,12 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo
_name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name);
// If input shape is dynamic, expect a configured window and dst at run-time.
- if(src.is_dynamic())
+ if (src.is_dynamic())
{
return;
}
- if(uk->prepare_func != nullptr)
+ if (uk->prepare_func != nullptr)
{
_lut = uk->prepare_func(op, &src, &dst);
}
@@ -238,28 +213,31 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- switch(op)
+ switch (op)
{
case ElementWiseUnary::EXP:
case ElementWiseUnary::RSQRT:
case ElementWiseUnary::LOG:
case ElementWiseUnary::ROUND:
case ElementWiseUnary::SIN:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
break;
case ElementWiseUnary::NEG:
case ElementWiseUnary::ABS:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
break;
default:
ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
}
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
}
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
index 00188f0d49..249909854e 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -42,8 +43,10 @@ namespace kernels
class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
{
private:
- using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
- using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
+ using ElementwiseUnaryUkernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
+ using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(
+ ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
public:
CpuElementwiseUnaryKernel() = default;
@@ -65,7 +68,7 @@ public:
static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct ElementwiseUnaryKernel
@@ -80,7 +83,7 @@ public:
private:
ElementWiseUnary _op{};
- ElementwiseUnaryUkernelPtr _run_method{ nullptr };
+ ElementwiseUnaryUkernelPtr _run_method{nullptr};
std::string _name{};
std::unique_ptr<uint8_t[]> _lut{};
};
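
Note on the prepare_func/_lut pair reformatted in the two hunks above: the ElementwiseUnaryPreparePtr typedef returns a byte buffer that configure() stores in _lut, and the micro-kernel signature takes a trailing const uint8_t * so it can consume that buffer at run time. The sketch below shows what such a prepare step could plausibly compute for a quantized EXP, assuming a 256-entry table indexed by the raw QASYMM8 value; the QuantInfo helper and the function name are illustrative stand-ins, not the library API.

#include <cmath>
#include <cstdint>
#include <memory>

struct QuantInfo
{
    float   scale;
    int32_t offset;
};

// Build a 256-entry table mapping every possible QASYMM8 input byte to the
// quantized result of exp(), so the run-time kernel only does a table lookup.
std::unique_ptr<uint8_t[]> prepare_exp_lut_q8(const QuantInfo &src_q, const QuantInfo &dst_q)
{
    auto lut = std::make_unique<uint8_t[]>(256);
    for (int i = 0; i < 256; ++i)
    {
        const float x = src_q.scale * (i - src_q.offset);                               // dequantize input
        const float y = std::exp(x);                                                    // apply the unary op
        const int   q = static_cast<int>(std::lround(y / dst_q.scale)) + dst_q.offset;  // requantize
        lut[i]        = static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q));          // clamp to uint8 range
    }
    return lut;
}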
diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp
index f69de0082d..754da97ae1 100644
--- a/src/cpu/kernels/CpuFillKernel.cpp
+++ b/src/cpu/kernels/CpuFillKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -68,17 +69,18 @@ void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator tensor_it(inout, collapsed);
- execute_window_loop(collapsed, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + tensor_it.offset();
- // Set memory
- for(int i = 0; i < window_width; ++i)
+ execute_window_loop(
+ collapsed,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
- }
-
- },
- tensor_it);
+ uint8_t *base_addr = start_valid_region + tensor_it.offset();
+ // Set memory
+ for (int i = 0; i < window_width; ++i)
+ {
+ std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+ }
+ },
+ tensor_it);
}
const char *CpuFillKernel::name() const
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
index ce41afc462..7c200c9b59 100644
--- a/src/cpu/kernels/CpuFillKernel.h
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_FILL_KERNEL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -48,7 +49,7 @@ public:
void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp
index 65e390a81a..df7e6aad46 100644
--- a/src/cpu/kernels/CpuFloorKernel.cpp
+++ b/src/cpu/kernels/CpuFloorKernel.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/floor/list.h"
namespace arm_compute
@@ -42,29 +42,22 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuFloorKernel::FloorKernel> available_kernels =
-{
- {
- "neon_fp16_floor",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
- },
- {
- "neon_fp32_floor",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
- }
-};
+static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = {
+ {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)},
+ {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
// Validate in case of configured output
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -81,7 +74,8 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
- const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -122,17 +116,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src_it(src, win);
Iterator dst_it(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- _run_method(src_it.ptr(), dst_it.ptr(), len);
- },
- src_it, dst_it);
+ execute_window_loop(
+ win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it);
}
const char *CpuFloorKernel::name() const
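
The available_kernels table reformatted above is the selection mechanism these kernels share: each entry pairs a name, a selector predicate over data type and ISA features, and a micro-kernel pointer, and get_implementation() returns the first matching entry, which configure() then stores in _run_method. Below is a standalone sketch of that pattern with simplified stand-in types; SelectorData and FloorKernelEntry are illustrative, not the library's DataTypeISASelectorData or CpuFloorKernel::FloorKernel.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

enum class DataType { F16, F32 };

struct SelectorData
{
    DataType dt;
    bool     fp16_supported;
};

struct FloorKernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)(const float *, float *, int);
};

void fp32_floor_stub(const float *, float *, int) {} // stand-in for the FP32 micro-kernel

static const std::vector<FloorKernelEntry> available_kernels = {
    {"neon_fp16_floor", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16_supported; }, nullptr},
    {"neon_fp32_floor", [](const SelectorData &d) { return d.dt == DataType::F32; }, fp32_floor_stub},
};

const FloorKernelEntry *get_implementation(const SelectorData &data)
{
    for (const auto &k : available_kernels)
    {
        if (k.is_selected(data))
        {
            return &k; // first match wins, so table order encodes priority
        }
    }
    return nullptr;
}

int main()
{
    const auto *uk = get_implementation({DataType::F32, false});
    std::cout << (uk != nullptr ? uk->name : "none") << '\n'; // prints "neon_fp32_floor"
}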
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
index 35ab534ca8..57107d0532 100644
--- a/src/cpu/kernels/CpuFloorKernel.h
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -65,7 +65,7 @@ public:
Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct FloorKernel
@@ -78,7 +78,7 @@ public:
static const std::vector<FloorKernel> &get_available_kernels();
private:
- FloorKernelPtr _run_method{ nullptr };
+ FloorKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
index 9fbf2d54c6..db433c99a8 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -60,7 +61,7 @@ Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITenso
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorShape dst_shape = compute_interleaved_shape(*src);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -111,35 +112,42 @@ void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &wind
Iterator in(src, win);
Iterator out(dst, win_out);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- if(id.y() + 4 <= static_cast<int>(in_height))
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- for(size_t x = window_start_x; x < window_end_x; ++x)
+ if (id.y() + 4 <= static_cast<int>(in_height))
{
- std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size);
- }
- }
- else
- {
- for(size_t x = window_start_x; x < window_end_x; ++x)
- {
- size_t y = 0;
- for(; y < partial_y; ++y)
+ for (size_t x = window_start_x; x < window_end_x; ++x)
{
- std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size);
+ std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size,
+ element_size);
}
- for(; y < 4; ++y)
+ }
+ else
+ {
+ for (size_t x = window_start_x; x < window_end_x; ++x)
{
- std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+ size_t y = 0;
+ for (; y < partial_y; ++y)
+ {
+ std::memcpy(out.ptr() + (x * 4 + y) * element_size,
+ (in.ptr() + y * in_stride) + x * element_size, element_size);
+ }
+ for (; y < 4; ++y)
+ {
+ std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+ }
}
}
- }
- },
- in, out);
+ },
+ in, out);
}
const char *CpuGemmInterleave4x4Kernel::name() const
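
The run_op hunk above zips groups of four source rows element by element into one output row (a00 b00 c00 d00 a01 b01 c01 d01 ...), zero-padding a trailing group with fewer than four rows. The scalar reference below sketches the same layout; the float element type and the function name are illustrative, since the kernel itself copies raw bytes of any element size.

#include <cstddef>
#include <vector>

std::vector<float> interleave_4x4(const std::vector<float> &src, size_t rows, size_t cols)
{
    const size_t       row_groups = (rows + 3) / 4;
    std::vector<float> dst(row_groups * cols * 4, 0.0f);

    for (size_t g = 0; g < row_groups; ++g) // one output row per group of 4 input rows
    {
        for (size_t x = 0; x < cols; ++x) // walk the columns
        {
            for (size_t y = 0; y < 4; ++y) // zip the 4 rows; missing rows stay zero
            {
                const size_t in_row = g * 4 + y;
                if (in_row < rows)
                {
                    dst[g * cols * 4 + x * 4 + y] = src[in_row * cols + x];
                }
            }
        }
    }
    return dst;
}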
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 4fb6a52a8b..2ce34bc4bc 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -71,7 +71,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
index f8bef64066..a3ed2cd171 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,646 +45,494 @@ namespace kernels
{
namespace
{
-void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_u8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
+ if (id.x() > width_b)
{
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
+ return;
}
- };
-
- auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const uint8x8_t a00_u8 = vld1_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
- const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
- const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
- const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
- const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
- const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
- const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
- const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4x2_t a00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(a00_u8)),
- vget_high_u16(vmovl_u8(a00_u8))
- }
- };
-
- const uint16x4x4_t b00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- const uint16x4x4_t b10_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
- }
- };
-
- const uint16x4x4_t b20_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
- }
- };
- const uint16x4x4_t b30_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
- }
- };
+ // Note: Since the input are all positives, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+ const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+ const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+ const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+ const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+ const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+ const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+ const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}};
+
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ const uint16x4x4_t b10_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}};
+
+ const uint16x4x4_t b20_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}};
+
+ const uint16x4x4_t b30_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}};
+
+ const uint16x4x4_t b40_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}};
+
+ const uint16x4x4_t b50_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}};
+
+ const uint16x4x4_t b60_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}};
+
+ const uint16x4x4_t b70_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
- const uint16x4x4_t b40_u16 =
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
- }
- };
+ const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
- const uint16x4x4_t b50_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
- }
- };
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
- const uint16x4x4_t b60_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
- }
- };
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
- const uint16x4x4_t b70_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
- const uint16x4x4_t b00_u16 =
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vec_out + k * 4 + j) = c0.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_s8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
- // Accumulators for the block 0
- int32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
+ if (id.x() > width_b)
{
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
+ return;
}
- };
-
- auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const int8x8_t a00_s8 = vld1_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
- const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
- const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
- const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
- const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
- const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
- const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
- const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
-
- // Convert a00_s8 to int16_t and get the lower part
- const int16x4x2_t a00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(a00_s8)),
- vget_high_s16(vmovl_s8(a00_s8))
- }
- };
-
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- const int16x4x4_t b10_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
- }
- };
- const int16x4x4_t b20_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
- }
- };
+ // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const int8x8_t a00_s8 = vld1_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+ const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+ const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+ const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+ const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+ const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+ const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+ const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}};
+
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ const int16x4x4_t b10_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}};
+
+ const int16x4x4_t b20_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}};
+
+ const int16x4x4_t b30_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}};
+
+ const int16x4x4_t b40_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}};
+
+ const int16x4x4_t b50_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}};
+
+ const int16x4x4_t b60_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}};
+
+ const int16x4x4_t b70_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
- const int16x4x4_t b30_s16 =
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
{
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
- }
- };
+ const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b);
- const int16x4x4_t b40_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
- }
- };
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
- const int16x4x4_t b50_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
- }
- };
+                // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
- const int16x4x4_t b60_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
- }
- };
-
- const int16x4x4_t b70_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
- const int16x4x4_t b00_s16 =
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
{
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // Convert a00_s8 to uint16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, c0.val[0]);
- vst1q_s32(vec_out + 4, c0.val[1]);
- vst1q_s32(vec_out + 8, c0.val[2]);
- vst1q_s32(vec_out + 12, c0.val[3]);
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ vst1q_s32(vec_out + 0, c0.val[0]);
+ vst1q_s32(vec_out + 4, c0.val[1]);
+ vst1q_s32(vec_out + 8, c0.val[2]);
+ vst1q_s32(vec_out + 12, c0.val[3]);
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vec_out + k * 4 + j) = c0.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_u8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
const auto width_out = static_cast<int>(out_info.dimension(0));
const auto height_out = static_cast<int>(out_info.dimension(1));
const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *mtx_a0 = ina.ptr();
- const uint8_t *mtx_b0 = inb.ptr();
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
+
+ // Note: Since the input are all positives, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 1
+ uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 2
+ uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 3
+ uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
+ const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+                // Convert b00_u8 to uint16_t
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
+ c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
+ c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
+ c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
+ c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
+ c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
+ c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
+ c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
+ c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
+ c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
}
- };
- // Accumulators for the block 1
- uint32x4x4_t c1 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
- // Accumulators for the block 2
- uint32x4x4_t c2 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- // Accumulators for the block 3
- uint32x4x4_t c3 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
- const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Convert b00_s8 to uint16_t
- const uint16x4x4_t b00_u16 =
+ if (id.y() < height_out && id.x() < (width_out - 16))
{
+ vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
+ if (id.y() + 1 < height_out)
{
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
- c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
- c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
- c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
- c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
- c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
- c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
- c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
- c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
- c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
- }
-
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
-
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
- vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
- vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
- vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
- vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
- vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
- vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
- if(id.y() + 3 < height_out)
+ vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
+ vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
+ vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
+ vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
+ if (id.y() + 2 < height_out)
{
- vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
- vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
- vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
- vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+ vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
+ vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
+ vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
+ vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
+ vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
+ vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
+ vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+ }
}
}
}
- }
- else
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
}
}
- if(id.y() + 2 < height_out)
+ if (id.y() + 1 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
}
}
- if(id.y() + 3 < height_out)
+ if (id.y() + 2 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
}
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_s8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
const auto width_out = static_cast<int>(out_info.dimension(0));
const auto height_out = static_cast<int>(out_info.dimension(1));
@@ -691,182 +540,148 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
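
The three comments above state the layout contract this kernel relies on: A is interleaved so that the four values of a given k step for four output rows sit consecutively, and B is transposed so that the sixteen values of the same k step for sixteen output columns sit consecutively. The scalar sketch below shows the 4x16 block that each iteration of the NEON loop accumulates under that contract; it is an illustrative reference, not the kernel code, and the parameter names and the int32 accumulator are assumptions.

#include <cstddef>
#include <cstdint>

void gemm_lowp_block_4x16_ref(const int8_t *mtx_a0,   // interleaved A: 4 consecutive values per k step
                              const int8_t *mtx_b0,   // transposed B: 16 consecutive values per k step
                              int32_t      *out,      // 4x16 output block, rows separated by out_stride
                              int           width_b,  // same bound as the kernel loop (k advances by 16)
                              size_t        out_stride)
{
    int32_t acc[4][16] = {};

    for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
    {
        for (int row = 0; row < 4; ++row)
        {
            for (int col = 0; col < 16; ++col)
            {
                // Everything read here is sequential in memory thanks to the reshapes.
                acc[row][col] += static_cast<int32_t>(mtx_a0[row]) * static_cast<int32_t>(mtx_b0[col]);
            }
        }
    }

    for (int row = 0; row < 4; ++row)
    {
        for (int col = 0; col < 16; ++col)
        {
            out[row * out_stride + col] = acc[row][col];
        }
    }
}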
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
- auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- int32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
+ auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+
+            // Note: The inputs are signed 8-bit, so we accumulate in int32_t
+            // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 1
+ int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 2
+ int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 3
+ int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const int8x8_t a00_s8 = vld1_s8(mtx_a0);
+ const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
+
+                // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+ // Convert b00_s8 to int16_t
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
+ c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
+ c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
+ c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
+ c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
+ c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
+ c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
+ c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
+ c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
+ c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
}
- };
-
- // Accumulators for the block 1
- int32x4x4_t c1 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 2
- int32x4x4_t c2 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 3
- int32x4x4_t c3 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const int8x8_t a00_s8 = vld1_s8(mtx_a0);
- const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
-
- // Convert a00_s8 to uint16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Convert b00_s8 to int16_t
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
- c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
- c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
- c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
- c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
- c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
- c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
- c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
- c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
- c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
- }
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
- vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
- vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
- vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
- vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
- vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
- vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
- vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
- vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
- vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
- if(id.y() + 3 < height_out)
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.y() < height_out && id.x() < (width_out - 16))
+ {
+ vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
+ vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
+ vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
+ vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
+ if (id.y() + 1 < height_out)
+ {
+ vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
+ vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
+ vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
+ vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
+ if (id.y() + 2 < height_out)
{
- vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
- vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
- vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
- vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+ vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
+ vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
+ vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
+ vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+ vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
+ vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
+ vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+ }
}
}
}
- }
- else if(id.y() < height_out)
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
+ else if (id.y() < height_out)
{
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
}
}
- if(id.y() + 2 < height_out)
+ if (id.y() + 1 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
}
}
- if(id.y() + 3 < height_out)
+ if (id.y() + 2 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
}
}
}
}
}
- }
-
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
TensorShape in0_shape = src0->tensor_shape();
@@ -874,9 +689,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
TensorShape out_shape = dst->tensor_shape();
// Check vector-by-matrix case
- if(out_shape[1] == 1)
+ if (out_shape[1] == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1],
+ "The number of input0's columns must be equal to input1's rows");
}
else
{
@@ -884,8 +700,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
in1_shape.collapse(2);
out_shape.collapse(2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2],
+ "Output tensor must have the same number of batches of input0 tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ in1_shape[2] != 1 && in0_shape[2] != in1_shape[2],
+ "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
}
@@ -909,20 +728,22 @@ void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const I
Window win;
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if((dst->dimension(1) == 1))
+ if ((dst->dimension(1) == 1))
{
// Configure kernel window
win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
}
else
{
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
}
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
return Status{};
@@ -939,12 +760,13 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
auto dst = tensors.get_tensor(TensorType::ACL_DST);
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
- if((dst->info()->dimension(1) == 1))
+ if ((dst->info()->dimension(1) == 1))
{
const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
const auto width_out = static_cast<int>(dst->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
+ const auto in_b_stride =
+ static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
// The implementation computes 16 elements per iteration
const int window_start_x = 16 * info.thread_id;
@@ -963,7 +785,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(src1->info()->num_dimensions() >= 3)
+ if (src1->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -974,18 +796,20 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Iterator inb(src1, win_b);
Iterator out(dst, win_out);
- switch(src0->info()->data_type())
+ switch (src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
{
- vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+ vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
break;
}
case DataType::U8:
case DataType::QASYMM8:
{
- vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+ vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
break;
}
default:
@@ -1009,7 +833,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_slide_matrix_b)
+ if (_slide_matrix_b)
{
win_b = window;
}
@@ -1021,7 +845,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Iterator inb(src1, win_b);
Iterator out(dst, window);
- switch(src0->info()->data_type())
+ switch (src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
@@ -1050,4 +874,4 @@ const char *CpuGemmLowpMatrixMultiplyKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
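A minimal scalar sketch of the arithmetic the NEON paths in this kernel vectorise: 8-bit operands are widened and multiply-accumulated into int32, here without the 4x16 blocking, interleaving or left-over handling shown above. The function and parameter names below are illustrative only, assuming plain row-major int8 inputs; they are not part of this patch or of the ACL API.

#include <cstdint>
#include <vector>

// Reference lowp GEMM: C (MxN, int32) = A (MxK, int8) * B (KxN, int8).
std::vector<int32_t> gemmlowp_s8_reference(const std::vector<int8_t> &a,
                                           const std::vector<int8_t> &b,
                                           int M, int K, int N)
{
    std::vector<int32_t> c(static_cast<size_t>(M) * N, 0);
    for (int m = 0; m < M; ++m)
    {
        for (int k = 0; k < K; ++k)
        {
            // Widen to 32 bit before accumulating, as the kernel does via the s16 lanes.
            const int32_t a_mk = static_cast<int32_t>(a[m * K + k]);
            for (int n = 0; n < N; ++n)
            {
                c[m * N + n] += a_mk * static_cast<int32_t>(b[k * N + n]);
            }
        }
    }
    return c;
}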
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 2cc789d6d9..439ada1b47 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -68,11 +68,11 @@ public:
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- bool _slide_matrix_b{ true };
+ bool _slide_matrix_b{true};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
index 534076b97c..9bd1eae663 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -38,37 +39,49 @@ namespace kernels
{
namespace
{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(1),
+ "Output vector must have length equal to the number of rows of the input matrix");
}
return Status{};
}
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(0),
+ "Output vector must have length equal to the number of columns of the input matrix");
}
return Status{};
}
} // namespace
-void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -77,7 +90,7 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
@@ -98,14 +111,18 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const arm_compute::Window &window)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -121,55 +138,58 @@ void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor
Iterator in(src, win_input);
Iterator out(dst, collapsed_window);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- TAcc sum_row = 0;
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
+ const T *matrix_a = reinterpret_cast<const T *>(
+ (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 16 accumulations
- for(; i <= (_k - 16); i += 16)
- {
- const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+ int i = 0;
+ // This for loop performs 16 accumulations
+ for (; i <= (_k - 16); i += 16)
+ {
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
- // Partial accumulations in U16
- const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+ // Partial accumulations in U16
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
- // Accumulate to U32
- vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
- }
+ // Accumulate to U32
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+ }
- // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- sum_row += static_cast<TAcc>(matrix_a[i]);
- }
+ // This for loop performs the leftover accumulations
+ for (; i < _k; ++i)
+ {
+ sum_row += static_cast<TAcc>(matrix_a[i]);
+ }
#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- sum_row += wrapper::vaddv(vsum_row);
+ // Reduction operation available on 64 bit architectures only
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
- tmp = wrapper::vpadd(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += wrapper::vgetlane(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_row *= _scalar;
- }
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_row *= _scalar;
+ }
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
- },
- in, out);
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
+ },
+ in, out);
}
void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -189,7 +209,9 @@ const char *CpuGemmLowpMatrixAReductionKernel::name() const
return "CpuGemmLowpMatrixAReductionKernel";
}
-void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
@@ -201,7 +223,7 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 16;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
@@ -223,14 +245,19 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -258,121 +285,116 @@ void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor
Iterator inb(src, win_in);
Iterator out(dst, win_out);
- execute_window_loop(win_out, [&](const Coordinates & id)
- {
- if(id.x() > width_matrix_b)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &id)
{
- return;
- }
+ if (id.x() > width_matrix_b)
+ {
+ return;
+ }
- // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
- {
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
- };
+ // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = {
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})};
- const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 4 accumulations
- for(; i <= (_k - 4); i += 4)
- {
- const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
- const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
- const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for (; i <= (_k - 4); i += 4)
+ {
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in 16bit
- typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
- {
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
-
- matrix_b += 4 * in_b_stride;
- }
-
- // This for loop perfoms the leftover accumulations
- for(; i < _k; ++i)
- {
- const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = {
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})};
+
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+ matrix_b += 4 * in_b_stride;
+ }
- // Convert S8 to S16
- const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
+            // This for loop performs the leftover accumulations
+ for (; i < _k; ++i)
{
- wrapper::vmovl(wrapper::vgetlow(b0_b8)),
- wrapper::vmovl(wrapper::vgethigh(b0_b8))
- };
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
+ // Convert S8 to S16
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]{
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))};
- matrix_b += in_b_stride;
- }
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
- sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
- sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
- sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
- }
-
- auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() + 16 < width_matrix_b)
- {
- wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
- wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
- wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
- wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
- }
- else
- {
- auto left_over = width_matrix_b - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+ sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+ sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+ sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+ }
+
+ auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() + 16 < width_matrix_b)
+ {
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_matrix_b - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ }
}
}
- }
- },
- inb, out);
+ },
+ inb, out);
}
void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -393,4 +415,4 @@ const char *CpuGemmLowpMatrixBReductionKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
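The two reduction kernels in this file accumulate, in int32, the per-row sums of matrix A (CpuGemmLowpMatrixAReductionKernel) and the per-column sums of matrix B (CpuGemmLowpMatrixBReductionKernel), optionally multiplying by a scalar when _mul_by_scalar is set; their outputs feed the offset-contribution kernel further down in this patch. A scalar sketch of those reductions follows, with illustrative (non-ACL) names and row-major int8 inputs assumed.

#include <cstdint>
#include <vector>

// Per-row sums of A (MxK): one int32 per row, optionally scaled.
std::vector<int32_t> row_sums_a(const std::vector<int8_t> &a, int M, int K, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> sums(M, 0);
    for (int m = 0; m < M; ++m)
    {
        for (int k = 0; k < K; ++k)
        {
            sums[m] += static_cast<int32_t>(a[m * K + k]);
        }
        if (mul_by_scalar)
        {
            sums[m] *= scalar;
        }
    }
    return sums;
}

// Per-column sums of B (KxN): one int32 per column, optionally scaled.
std::vector<int32_t> col_sums_b(const std::vector<int8_t> &b, int K, int N, int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> sums(N, 0);
    for (int k = 0; k < K; ++k)
    {
        for (int n = 0; n < N; ++n)
        {
            sums[n] += static_cast<int32_t>(b[k * N + n]);
        }
    }
    if (mul_by_scalar)
    {
        for (int n = 0; n < N; ++n)
        {
            sums[n] *= scalar;
        }
    }
    return sums;
}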
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
index e469629cdb..20ef17e96d 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -66,7 +66,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -85,12 +85,14 @@ private:
* @param[out] dst Output tensor
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+ using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window);
- CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr };
- int32_t _k{ 0 };
- int32_t _scalar{ 0 };
- bool _mul_by_scalar{ false };
+ CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
};
/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
@@ -124,7 +126,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -144,12 +146,15 @@ private:
* @param[out] dst Output tensor
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+ using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info);
- CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr };
- int32_t _k{ 0 };
- int32_t _scalar{ 0 };
- bool _mul_by_scalar{ false };
+ CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
index a65f1a33de..e290783021 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,32 +45,37 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -80,13 +86,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
@@ -94,9 +102,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
return Status{};
}
-void run_offset_contribution(const Window &window,
- ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
+void run_offset_contribution(const Window &window,
+ ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool slide_vector_sum_col,
+ bool is_gemm3d)
{
Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -112,7 +126,7 @@ void run_offset_contribution(const Window &window,
const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0;
Iterator mm_result_it(mm_result, collapsed_window);
- if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+ if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
{
// Set window for vector_sum_col
Window win_vector_sum_col(collapsed_window);
@@ -131,95 +145,85 @@ void run_offset_contribution(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
// Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const size_t batch_offset_col = batch_id * (sum_col_stride_y );
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
{
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- // Add a_offset_term_s32 and b_offset_term_s32
- int32x4x4_t offset_term_s32 =
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col = batch_id * (sum_col_stride_y);
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
-
- offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
- offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
- offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
- offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
-
- int32x4x4_t in_s32 =
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 = {
+ {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+
+ offset_term_s32.val[0] =
+ vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+ offset_term_s32.val[1] =
+ vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+ offset_term_s32.val[2] =
+ vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+ offset_term_s32.val[3] =
+ vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
+ // Compute the leftover term due to a_offset.
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- a_offset_term_s32 *= a_offset;
+ a_offset_term_s32 *= a_offset;
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
- }
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it);
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+ }
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it);
}
- else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+ else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -233,54 +237,51 @@ void run_offset_contribution(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += b_offset_term_s32;
- }
- },
- vector_sum_row_it, mm_result_it);
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += b_offset_term_s32;
+ }
+ },
+ vector_sum_row_it, mm_result_it);
}
- else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+ else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
{
// Set window for vector_sum_col
Window win_vector_sum_col(collapsed_window);
@@ -290,69 +291,62 @@ void run_offset_contribution(const Window &window,
Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
// Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
{
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col =
+ batch_id *
+ (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- int32x4x4_t in_s32 =
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += a_offset_term_s32 * a_offset;
- }
- },
- vector_sum_col_it, mm_result_it);
+ // Compute the leftover term due to a_offset.
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+ }
+ },
+ vector_sum_col_it, mm_result_it);
}
else // false, false
{
@@ -362,7 +356,12 @@ void run_offset_contribution(const Window &window,
}
} // namespace
-void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset)
{
// Perform validate step
ARM_COMPUTE_UNUSED(vector_sum_row);
@@ -374,7 +373,7 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen
_k_offset = a_offset * b_offset * k;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
// Check if vector_sum_col_shape should be slidden or not
// Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
@@ -387,8 +386,11 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen
ICpuKernel::configure(win);
}
-Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
+Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
return Status{};
@@ -405,11 +407,11 @@ void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Win
auto mm_result = tensors.get_tensor(TensorType::ACL_DST);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
- run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+ run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset,
+ _slide_vector_sum_col, reinterpret_as_3d);
}
const char *CpuGemmLowpOffsetContributionKernel::name() const
@@ -418,4 +420,4 @@ const char *CpuGemmLowpOffsetContributionKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
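Per output element, the kernel above adds k_offset + a_offset * vector_sum_col[x] + b_offset * vector_sum_row[y], where k_offset = a_offset * b_offset * k is computed in configure() and either offset term is skipped when the corresponding offset is zero. A scalar sketch of that fix-up, assuming a row-major MxN result and using illustrative (non-ACL) names:

#include <cstdint>
#include <vector>

// Adds the quantization-offset correction terms to an int32 GEMM result.
void offset_contribution_reference(std::vector<int32_t>       &mm_result, // M x N, row major
                                   const std::vector<int32_t> &row_sum_a, // length M (vector_sum_row)
                                   const std::vector<int32_t> &col_sum_b, // length N (vector_sum_col)
                                   int M, int N, int K,
                                   int32_t a_offset, int32_t b_offset)
{
    const int32_t k_offset = a_offset * b_offset * K; // mirrors _k_offset = a_offset * b_offset * k
    for (int y = 0; y < M; ++y)
    {
        const int32_t b_term = b_offset * row_sum_a[y];
        for (int x = 0; x < N; ++x)
        {
            mm_result[y * N + x] += k_offset + a_offset * col_sum_b[x] + b_term;
        }
    }
}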
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index 3514ca811d..08b2d47529 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -63,24 +63,33 @@ public:
* @param[in] a_offset Offset to be added to each element of the matrix A.
* @param[in] b_offset Offset to be added to each element of the matrix B.
*/
- void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+ void configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpOffsetContributionKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- int32_t _k_offset{ 0 };
- bool _slide_vector_sum_col{ true };
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k_offset{0};
+ bool _slide_vector_sum_col{true};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
index 190487eced..d008842398 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -31,10 +31,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -48,80 +49,38 @@ namespace
{
inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
{
- return
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
- }
- };
+ return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}};
}
inline int32x4x4_t load(const int32_t *ptr, int32_t x)
{
- return
- {
- {
- vld1q_s32(ptr + x + 0),
- vld1q_s32(ptr + x + 4),
- vld1q_s32(ptr + x + 8),
- vld1q_s32(ptr + x + 12)
- }
- };
+ return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}};
}
inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
+ return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}};
}
inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
+ return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])}};
}
inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
+ return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)}};
}
inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier)
{
- return
- {
- {
- vmulq_s32(a.val[0], vld1q_s32(multilpier)),
- vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)),
- vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)),
- vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))
- }
- };
+ return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}};
}
inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
@@ -144,18 +103,11 @@ inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offse
inline int32x4x4_t get_k_offset(int32_t k_offset)
{
- return
- {
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
+ return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
}
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
+inline uint8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -172,18 +124,13 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to U8
uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = vmaxq_u8(out_u8, min_u8);
out_u8 = vminq_u8(out_u8, max_u8);
@@ -192,7 +139,8 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -209,18 +157,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -229,7 +172,8 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
return out_s8;
}
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -246,18 +190,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -305,81 +244,103 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
}
template <typename VT>
-inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
- typename VT::vtype min_vec, typename VT::vtype max_vec,
- int32_t a_offset, int32_t b_offset, int32_t k_offset,
- int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr,
+ const int32_t *vector_sum_row_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32x4_t result_offset_s32,
+ const int32x4_t result_shift_s32,
+ typename VT::vtype min_vec,
+ typename VT::vtype max_vec,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ int32_t multiplier,
+ int32_t shift,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_b_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
{
// Combine quantization offset with other offsets.
offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
}
- if(has_a_offset && has_b_offset)
+ if (has_a_offset && has_b_offset)
{
offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
}
- if(has_b_offset)
+ if (has_b_offset)
{
offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
}
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
- if(has_a_offset)
+ if (has_a_offset)
{
in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
}
- if(has_bias)
+ if (has_bias)
{
in_s32 = add_s32(in_s32, load(bias_ptr, x));
}
- if(!is_fixed_point || has_b_offset)
+ if (!is_fixed_point || has_b_offset)
{
in_s32 = add_s32(in_s32, offset_term_s32);
}
- if(!is_fixed_point)
+ if (!is_fixed_point)
{
in_s32 = mul_s32(in_s32, multiplier);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
}
else
{
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
}
}
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
- if(has_a_offset)
+ if (has_a_offset)
{
in_value += (*(vector_sum_col_ptr + x) * a_offset);
}
- if(has_bias)
+ if (has_bias)
{
in_value += *(bias_ptr + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
// Finalize and store the result
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
- static_cast<typename VT::stype>(min_bound),
- static_cast<typename VT::stype>(max_bound), is_bounded_relu);
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound),
+ static_cast<typename VT::stype>(max_bound), is_bounded_relu);
}
else
{
@@ -387,75 +348,100 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
in_value = (in_value * multiplier) >> shift;
// Bound and store the result
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
- in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ in_value = static_cast<typename VT::stype>(
+ std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
- std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ static_cast<typename VT::stype>(std::max<int32_t>(
+ static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
+ std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
}
}
}
-inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32_t *result_multipliers, const int32_t *result_shifts,
- const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
- int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32_t *result_multipliers,
+ const int32_t *result_shifts,
+ const int32x4_t result_offset,
+ int8x16_t min_s8,
+ int8x16_t max_s8,
+ int32_t a_offset,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
{
// Combine quantization offset with other offsets.
offset_term_s32 = add_s32(offset_term_s32, result_offset);
}
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
- if(has_a_offset)
+ if (has_a_offset)
{
in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
}
- if(has_bias)
+ if (has_bias)
{
in_s32 = add_s32(in_s32, load(bias_ptr, x));
}
- if(!is_fixed_point)
+ if (!is_fixed_point)
{
in_s32 = add_s32(in_s32, offset_term_s32);
in_s32 = mul_s32(in_s32, result_multipliers + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x),
+ result_offset, min_s8, max_s8, is_bounded_relu));
}
else
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
}
}
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
- if(has_a_offset)
+ if (has_a_offset)
{
in_value += (*(vector_sum_col_ptr + x) * a_offset);
}
- if(has_bias)
+ if (has_bias)
{
in_value += *(bias_ptr + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
+ *(out_it.ptr() + x) =
+ finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset,
+ static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
}
else
{
@@ -463,7 +449,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
// Bound and store the result
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
@@ -473,10 +459,20 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
}
template <typename T>
-void run_offset_contribution_output_stage(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
// Semantics of XYZW Explained for each tensor
//
@@ -516,7 +512,7 @@ void run_offset_contribution_output_stage(const Window &window,
Iterator mm_result_it(mm_result, win);
Iterator out_it(output, win);
- if((a_offset != 0) && (b_offset != 0))
+ if ((a_offset != 0) && (b_offset != 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -527,45 +523,52 @@ void run_offset_contribution_output_stage(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
- mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+ mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset,
+ k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32,
+ result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false,
+ is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
}
- else if((a_offset == 0) && (b_offset != 0))
+ else if ((a_offset == 0) && (b_offset != 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -573,114 +576,139 @@ void run_offset_contribution_output_stage(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ false, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_row_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_row_it, mm_result_it, out_it);
}
}
- else if((a_offset != 0) && (b_offset == 0))
+ else if ((a_offset != 0) && (b_offset == 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ true, false, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
}
}
else
{
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier,
+ shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false,
+ true, is_bounded_relu, is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec,
+ max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ mm_result_it, out_it);
}
return;
}
}
-void run_offset_contribution_output_stage_symm(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
@@ -690,8 +718,8 @@ void run_offset_contribution_output_stage_symm(const Window &window,
const int32_t min_bound = output_stage.gemmlowp_min_bound;
const int32_t max_bound = output_stage.gemmlowp_max_bound;
- const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
- const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
@@ -708,88 +736,105 @@ void run_offset_contribution_output_stage_symm(const Window &window,
Iterator mm_result_it(mm_result, win);
Iterator out_it(output, win);
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x,
+ window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
}
}
else
{
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32,
+ min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, false, false, is_bounded_relu, is_fixed_point);
+ },
+ mm_result_it, out_it);
}
return;
}
}
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- if(output->data_type() != DataType::QASYMM8)
+ if (output->data_type() != DataType::QASYMM8)
{
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 &&
+ b_offset != 0);
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN &&
+ output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -797,7 +842,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
@@ -805,19 +850,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = output->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -828,13 +875,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
@@ -842,7 +891,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
@@ -852,15 +901,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
} // namespace
-void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
- int32_t k, int32_t a_offset, int32_t b_offset,
+void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_UNUSED(vector_sum_row, bias);
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
_a_offset = a_offset;
_b_offset = b_offset;
@@ -868,7 +923,7 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo
_output_stage = output_stage;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
// Check if vector_sum_col_shape should be slidden or not
// Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
@@ -888,16 +943,24 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
return Status{};
}
-void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -912,14 +975,14 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors
PixelValue type_min{};
PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(dst->info()->data_type());
- int32_t type_min_int = type_min.get<int32_t>();
- int32_t type_max_int = type_max.get<int32_t>();
+ int32_t type_min_int = type_min.get<int32_t>();
+ int32_t type_max_int = type_max.get<int32_t>();
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
- const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+ const bool is_bounded_relu =
+ !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
// Check if we need to perform fixed point requantization
const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
@@ -930,22 +993,25 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors
// Check if symmetric per-channel execution
const bool is_symm = _output_stage.is_quantized_per_channel;
- if(is_symm)
+ if (is_symm)
{
- run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst,
+ _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched,
+ _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
- if(is_signed)
+ if (is_signed)
{
- run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage<int8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
- run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage<uint8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
}
}
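The leftover-element path reflowed above reads more clearly as a scalar recipe. The sketch below restates it for a single element of the QUANTIZE_DOWN (non-fixed-point) branch with uint8 output; it is an illustration with placeholder names, not code from this file, and it ignores the fixed-point and per-channel symmetric variants.

#include <algorithm>
#include <cstdint>
#include <limits>

// Illustrative scalar model of one QUANTIZE_DOWN output element.
uint8_t quantize_down_scalar(int32_t acc,           // raw S32 GEMM accumulator
                             int32_t offset_term,   // result_offset + k_offset + b_offset * sum_row[y]
                             int32_t a_offset_term, // a_offset * sum_col[x]
                             int32_t bias,          // optional per-column bias
                             int32_t multiplier, int32_t shift,
                             int32_t min_bound, int32_t max_bound, bool bounded_relu)
{
    int32_t v = acc + offset_term + a_offset_term + bias;
    v         = (v * multiplier) >> shift; // integer scale-down
    if (bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v)); // clamp to the requested range
    }
    // Final saturation to the output data type's range.
    v = std::max<int32_t>(std::numeric_limits<uint8_t>::lowest(),
                          std::min<int32_t>(std::numeric_limits<uint8_t>::max(), v));
    return static_cast<uint8_t>(v);
}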
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
index 3cb99faee8..af477d4756 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -85,7 +86,13 @@ public:
* @param[in] b_offset Offset to be added to each element of the matrix B.
* @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
*/
- void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset,
+ void configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
int32_t b_offset,
GEMMLowpOutputStageInfo output_stage);
/** Static function to check if given info will lead to a valid configuration
@@ -94,21 +101,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
int32_t b_offset,
GEMMLowpOutputStageInfo output_stage);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
/** Function to use for the particular tensors passed to configure() */
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- int32_t _k_offset{ 0 };
- bool _is_vector_sum_col_batched{ true };
- GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() };
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k_offset{0};
+ bool _is_vector_sum_col_batched{true};
+ GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()};
};
} // namespace kernels
} // namespace cpu
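To tie this header back to the validate_arguments() checks reflowed earlier in the patch, here is a hedged configuration sketch. The GEMMLowpOutputStageInfo field names are taken from the .cpp hunks above; the shapes, quantization parameters and the function name are invented for illustration and are not part of the change.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"

using namespace arm_compute;
using namespace arm_compute::cpu::kernels;

Status validate_fused_output_stage_example()
{
    // Hypothetical M x N = 4 x 8 GEMM with K = 16 and a fused requantization stage.
    TensorInfo mm_result(TensorShape(8U, 4U), 1, DataType::S32);
    TensorInfo vector_sum_col(TensorShape(8U), 1, DataType::S32);
    TensorInfo vector_sum_row(TensorShape(4U), 1, DataType::S32);
    TensorInfo bias(TensorShape(8U), 1, DataType::S32);
    TensorInfo dst(TensorShape(8U, 4U), 1, DataType::QASYMM8);

    GEMMLowpOutputStageInfo output_stage{};
    output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    output_stage.output_data_type    = DataType::QASYMM8;
    output_stage.gemmlowp_offset     = 10;         // illustrative requantization parameters
    output_stage.gemmlowp_multiplier = 1073741824; // ~0.5 in Q31
    output_stage.gemmlowp_shift      = 5;
    output_stage.gemmlowp_min_bound  = 0;
    output_stage.gemmlowp_max_bound  = 255;

    return CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result, &vector_sum_col, &vector_sum_row,
                                                                    &bias, &dst, 3 /*a_offset*/, -5 /*b_offset*/,
                                                                    output_stage);
}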
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
index 3023d93113..eefc294700 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -28,13 +28,14 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/AccessWindowStatic.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -46,26 +47,35 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+ output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED))
+ if (dst->data_type() != output_stage->output_data_type &&
+ (output_stage->output_data_type == DataType::QASYMM8 ||
+ output_stage->output_data_type == DataType::QASYMM8_SIGNED))
{
ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
}
@@ -92,24 +102,26 @@ inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
+inline
+ typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+ convert_to_8bit(const int16x8x2_t in_s16)
{
return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+convert_to_8bit(const int16x8x2_t in_s16)
{
return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
}
template <typename T>
-inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
- typename wrapper::traits::neon_vector<T, 16>::type max)
+inline typename wrapper::traits::neon_vector<T, 16>::type
+finalize_quantization(int32x4x4_t &in_s32,
+ int32x4_t result_shift_s32,
+ typename wrapper::traits::neon_vector<T, 16>::type min,
+ typename wrapper::traits::neon_vector<T, 16>::type max)
{
// Shift final result (negative value shift right)
in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
@@ -118,13 +130,8 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(
in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8 or U8
typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
@@ -137,7 +144,10 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(
} // namespace
template <typename T>
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
@@ -159,107 +169,105 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, c
Iterator in(src, win);
Iterator out(dst, win);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
-
- // Quantize
- in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
-
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, bias_i, out);
+ const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+ // Quantize
+ in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) *
+ _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
+
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, bias_i, out);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
- // Quantize
- in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
+ // Quantize
+ in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, out);
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -268,10 +276,7 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
 // Output auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- bias,
- dst,
- output_stage));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
_output_stage = output_stage;
@@ -281,14 +286,17 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
ICpuKernel::configure(win);
// Check if we need to clamp the result using min and max
- _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
- && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
- if(_output_stage->output_data_type == DataType::QASYMM8)
+ _is_bounded_relu =
+ ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) &&
+ !(_output_stage->gemmlowp_min_bound ==
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) &&
+ _output_stage->gemmlowp_max_bound ==
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
+ if (_output_stage->output_data_type == DataType::QASYMM8)
{
_func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
}
- else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
+ else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
{
_func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
}
@@ -298,7 +306,10 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
}
}
-Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
return Status{};
@@ -323,4 +334,4 @@ const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
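
For reference, the NEON path and the left-over loop in this kernel compute the same scalar arithmetic. A minimal sketch of that quantize-down-by-scale step, with illustrative names only (not ACL API):

#include <algorithm>
#include <cstdint>

// Add the bias and the GEMMLowp offset, multiply by the integer multiplier,
// shift right, then clamp to the output type's range — exactly what the
// left-over loop above does element by element.
inline int32_t quantize_down_scale_example(int32_t acc, int32_t bias, int32_t offset,
                                           int32_t multiplier, int32_t shift,
                                           int32_t clamp_min, int32_t clamp_max)
{
    const int32_t scaled = ((acc + bias + offset) * multiplier) >> shift;
    return std::min(std::max(scaled, clamp_min), clamp_max);
}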
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index c7813edcd7..33e296b251 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -71,10 +72,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -95,11 +99,14 @@ private:
* @param[out] dst Output tensor info
* @param[in] window Region on which to execute the kernel.
*/
- using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- const GEMMLowpOutputStageInfo *_output_stage{ nullptr };
- bool _is_bounded_relu{ false };
+ QuantizeDownFunctionPtr _func{nullptr};
+ const GEMMLowpOutputStageInfo *_output_stage{nullptr};
+ bool _is_bounded_relu{false};
};
} // namespace kernels
} // namespace cpu
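
The header above stores the per-type specialisation in a member function pointer that configure() fills in and run_op() later invokes. A stripped-down sketch of that dispatch pattern, with hypothetical names (not the library's types):

#include <cstdint>

class QuantizeDownDispatchExample
{
public:
    void configure(bool output_is_unsigned)
    {
        // Pick the template instantiation once, at configure time,
        // mirroring the QASYMM8 / QASYMM8_SIGNED selection above.
        _func = output_is_unsigned ? &QuantizeDownDispatchExample::run_internal<uint8_t>
                                   : &QuantizeDownDispatchExample::run_internal<int8_t>;
    }

    void run() { (this->*_func)(); } // run_op() forwards to the stored member pointer

private:
    template <typename T>
    void run_internal() { /* per-type kernel body */ }

    using FuncPtr = void (QuantizeDownDispatchExample::*)();
    FuncPtr _func{nullptr};
};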
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 53ca991889..a5c09c9977 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NESymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NESymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min));
const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max));
@@ -88,92 +92,92 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(co
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x2_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
- const int32x4x2_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)
- }
- };
+ const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}};
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out, bias_i);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x2_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
- ARM_COMPUTE_UNUSED(in_value);
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ ARM_COMPUTE_UNUSED(in_value);
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min,
+ int max)
{
// Perform validate step
ARM_COMPUTE_UNUSED(bias, dst);
@@ -193,18 +197,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITens
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
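
The NEON helper finalize_quantization_int16() used above comes from NESymm.h. As a rough scalar approximation of the fixed-point rescale it performs — assuming the usual gemmlowp convention of a rounding doubling high multiply followed by a rounding right shift, which is an assumption here, not a statement of the exact ACL semantics:

#include <algorithm>
#include <cstdint>

// Scalar approximation of requantizing an int32 accumulator to QSYMM16.
// The INT32_MIN * INT32_MIN saturation corner case is ignored in this sketch.
inline int16_t quantize_down_fixedpoint_s16_example(int32_t acc, int32_t multiplier, int shift,
                                                    int16_t clamp_min, int16_t clamp_max)
{
    // Rounding doubling high multiply: keep the top 32 bits of 2*acc*multiplier.
    const int64_t prod = static_cast<int64_t>(acc) * multiplier;
    int32_t       high = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

    // Rounding right shift (shift assumed non-negative in this sketch).
    if (shift > 0)
    {
        const int32_t round = 1 << (shift - 1);
        high = (high + round) >> shift;
    }

    const int32_t clamped = std::min<int32_t>(std::max<int32_t>(high, clamp_min), clamp_max);
    return static_cast<int16_t>(clamped);
}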
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 681d099695..925788b680 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -48,7 +49,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
@@ -65,17 +67,24 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -97,13 +106,13 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index 27214dcb5a..0e58097073 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min));
@@ -88,102 +92,102 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(con
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias_i);
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -205,18 +209,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITenso
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= -128 && max >= 127);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 3e615b935e..6a67ba4f19 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -49,7 +50,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -99,14 +109,14 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
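
A note on the is_bounded_relu plumbing visible above: the clamp decision is made once in configure() and baked into the template argument of run_internal, so the per-element loop never branches on it at run time. A minimal sketch of that pattern, illustrative only:

#include <algorithm>
#include <cstdint>

template <bool is_bounded_relu>
int8_t finalize_example(int32_t v, int8_t lo, int8_t hi)
{
    // When the template argument is false, the compiler removes this clamp entirely.
    if (is_bounded_relu)
    {
        v = std::min<int32_t>(std::max<int32_t>(v, lo), hi);
    }
    // Saturate to the int8 range before the final cast.
    return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(v, -128), 127));
}

// configure()-time selection, mirroring run_internal<true/false> above:
//   const bool bounded = !(min <= -128 && max >= 127);
//   func = bounded ? &finalize_example<true> : &finalize_example<false>;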
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index e49fd29115..e3dd2240ca 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
@@ -89,98 +93,102 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(co
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias_i);
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -202,18 +210,21 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITens
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= 0 && max >= 255);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -233,4 +244,4 @@ const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() c
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
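
All of these quantize-down kernels share the same window-walking shape: a NEON main loop that consumes 16 int32 accumulators per iteration, followed by a scalar tail for whatever does not fill a full vector. A condensed sketch of that structure, with a scalar stand-in for the vector body and illustrative names:

#include <cstdint>

void quantize_row_example(const int32_t *in, uint8_t *out, int start_x, int end_x,
                          uint8_t (*finalize)(int32_t))
{
    constexpr int step = 16; // elements per vector iteration
    int x = start_x;

    // Main loop: whole vectors of 16 accumulators (stands in for the vld1q/vst1q block).
    for (; x <= end_x - step; x += step)
    {
        for (int i = 0; i < step; ++i)
        {
            out[x + i] = finalize(in[x + i]);
        }
    }

    // Left-over elements that do not fill a full vector.
    for (; x < end_x; ++x)
    {
        out[x] = finalize(in[x]);
    }
}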
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index b773fdfdcf..45bd742a70 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -49,7 +50,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -93,14 +103,14 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
index 6399ebbef4..fb1b70b91f 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
#include "src/cpu/kernels/gemm_matrix_add/list.h"
namespace arm_compute
{
@@ -40,24 +41,12 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels =
-{
- {
- "neon_fp32_gemm_matrix_add",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)
- },
- {
- "neon_fp16_gemm_matrix_add",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)
- },
+static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)},
+ {"neon_fp16_gemm_matrix_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)},
};
} // namespace
@@ -71,7 +60,8 @@ void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta));
_beta = beta;
- const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_func = uk->ukernel;
// Configure kernel window
@@ -87,7 +77,7 @@ Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -105,7 +95,7 @@ void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &win
const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_beta != 0.0f)
+ if (_beta != 0.0f)
{
(*_func)(src, dst, window, _beta);
}
@@ -116,7 +106,8 @@ const char *CpuGemmMatrixAdditionKernel::name() const
return "CpuGemmMatrixAdditionKernel";
}
-const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &CpuGemmMatrixAdditionKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &
+CpuGemmMatrixAdditionKernel::get_available_kernels()
{
return available_kernels;
}
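
The available_kernels table reformatted above is the usual micro-kernel registry pattern: each entry pairs a name, a selector predicate over data type and ISA features, and a function pointer, and configure() takes the first entry whose predicate matches. A self-contained sketch of the idea, with hypothetical types rather than the library's selectors:

#include <functional>
#include <string>
#include <vector>

struct SelectorDataExample
{
    bool is_f32;
    bool fp16_supported;
};

struct UKernelExample
{
    std::string                                     name;
    std::function<bool(const SelectorDataExample&)> is_selected;
    void (*ukernel)();
};

void run_fp32_example() {}
void run_fp16_example() {}

static const std::vector<UKernelExample> available_kernels_example = {
    {"fp32_example", [](const SelectorDataExample &d) { return d.is_f32; }, run_fp32_example},
    {"fp16_example", [](const SelectorDataExample &d) { return !d.is_f32 && d.fp16_supported; }, run_fp16_example},
};

// First-match lookup, as configure() does via get_implementation():
const UKernelExample *pick(const SelectorDataExample &d)
{
    for (const auto &k : available_kernels_example)
    {
        if (k.is_selected(d))
        {
            return &k;
        }
    }
    return nullptr;
}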
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
index cbc5b53087..5e12f1dcbd 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -75,7 +75,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<GemmMatrixAddKernel> &get_available_kernels();
@@ -89,8 +89,8 @@ private:
* @param[in] beta Weight of matrix C
*/
/** Matrix addition function to use for the particular tensor types passed to configure() */
- GemmMatrixAddKernelPtr _func{ nullptr };
- float _beta{ 0.f };
+ GemmMatrixAddKernelPtr _func{nullptr};
+ float _beta{0.f};
};
} // namespace kernels
} // namespace cpu
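
For context, this kernel contributes the beta*C term of the GEMM expression D = alpha*A*B + beta*C, and run_op() above skips the call entirely when beta == 0. A scalar reference of what the selected ukernel computes per element, illustrative only:

#include <cstddef>

// dst already holds alpha*A*B; this step accumulates beta * C into it.
void gemm_matrix_add_reference(const float *c, float *dst, std::size_t n, float beta)
{
    if (beta == 0.0f)
    {
        return; // matches the beta != 0 guard in run_op()
    }
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] += beta * c[i];
    }
}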
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
index 03b372efd4..beccd94844 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
@@ -26,10 +26,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/gemm_matrix_mul/list.h"
@@ -42,27 +43,20 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels =
-{
- {
- "neon_fp32_gemm_matrix_mul",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)
- },
- {
- "neon_fp16_gemm_matrix_mul",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)
- },
+static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)},
+ {"neon_fp16_gemm_matrix_mul",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)},
};
-inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+inline Status validate_arguments(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_UNUSED(alpha);
@@ -70,11 +64,11 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
- if(!is_interleaved)
+ if (!is_interleaved)
{
ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1));
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1));
@@ -90,28 +84,31 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
/* Interleave */
- TensorShape tensor_shape0{ lhs->tensor_shape() };
+ TensorShape tensor_shape0{lhs->tensor_shape()};
tensor_shape0.set(0, k);
tensor_shape0.set(1, m);
const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0);
- if(n != 0) /* Transpose */
+ if (n != 0) /* Transpose */
{
- TensorShape tensor_shape1{ rhs->tensor_shape() };
+ TensorShape tensor_shape1{rhs->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+ const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(
+ tensor_info1, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1);
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- if(n != 0)
+ if (n != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n));
}
@@ -125,12 +122,17 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
} // namespace
-void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
    // dst tensor auto initialization if not yet initialized
- TensorShape tensor_shape{ lhs->tensor_shape() };
+ TensorShape tensor_shape{lhs->tensor_shape()};
tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0));
tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1));
@@ -146,7 +148,7 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso
    // Check if the dst tensor is a vector. If so, the kernel runs the vector-matrix multiplication
const bool is_dst_vector = (dst->dimension(1) == 1);
- if(is_dst_vector)
+ if (is_dst_vector)
{
const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32;
@@ -157,17 +159,23 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso
constexpr unsigned int num_elems_processed_per_iteration_x = 8;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
}
- const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(
+ DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_func = uk->ukernel;
ICPPKernel::configure(win);
}
-Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved,
+Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
@@ -195,7 +203,8 @@ const char *CpuGemmMatrixMultiplyKernel::name() const
return "CpuGemmMatrixMultiplyKernel";
}
-const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &CpuGemmMatrixMultiplyKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &
+CpuGemmMatrixMultiplyKernel::get_available_kernels()
{
return available_kernels;
}
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
index a7dfec87bd..765fcb8275 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -42,7 +42,8 @@ namespace kernels
class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
{
private:
- using GemmMatrixMulKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
+ using GemmMatrixMulKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
public:
struct GemmMatrixMulKernel
@@ -67,17 +68,27 @@ public:
* @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
* @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped
*/
- void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+ void configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel
*
* Similar to @ref CpuGemmMatrixMultiplyKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<GemmMatrixMulKernel> &get_available_kernels();
@@ -94,8 +105,8 @@ private:
*/
/** Matrix multiply function to use for the particular tensor types passed to configure() */
- GemmMatrixMulKernelPtr _func{ nullptr };
- float _alpha{ 1.f };
+ GemmMatrixMulKernelPtr _func{nullptr};
+ float _alpha{1.f};
};
} // namespace kernels
} // namespace cpu
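
The selector tables reflowed above (available_kernels plus the get_implementation(DataTypeISASelectorData{...}) lookups) all follow the same registry pattern: each entry pairs a kernel name, a predicate over data type and ISA features, and a function pointer, and the first entry whose predicate matches wins. The stand-alone sketch below mirrors that shape with simplified, made-up types (DT, MyISAInfo, MySelectorData, MicroKernel); it is an illustration of the pattern, not the library's actual implementation.

#include <cstdio>
#include <vector>

enum class DT { F16, F32 };

struct MyISAInfo { bool fp16; };
struct MySelectorData { DT dt; MyISAInfo isa; };

using KernelFn = void (*)();

struct MicroKernel
{
    const char *name;
    bool (*is_selected)(const MySelectorData &);
    KernelFn ukernel;
};

void fp32_gemm() { std::puts("fp32 path"); }
void fp16_gemm() { std::puts("fp16 path"); }

// One entry per micro-kernel: a name, a selection predicate and the implementation.
static const std::vector<MicroKernel> available = {
    {"fp32_gemm", [](const MySelectorData &d) { return d.dt == DT::F32; }, &fp32_gemm},
    {"fp16_gemm", [](const MySelectorData &d) { return d.dt == DT::F16 && d.isa.fp16; }, &fp16_gemm},
};

// First-match lookup, the same shape as the get_implementation() calls in the kernels above.
const MicroKernel *get_implementation(const MySelectorData &data)
{
    for (const auto &k : available)
    {
        if (k.is_selected(data))
        {
            return &k;
        }
    }
    return nullptr;
}

int main()
{
    const MicroKernel *uk = get_implementation({DT::F32, {true}});
    if (uk != nullptr)
    {
        uk->ukernel(); // prints "fp32 path"
    }
    return 0;
}
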
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
index 62d5d5f5e9..c47746bc4b 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -63,9 +64,10 @@ Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_transpose1xW_with_element_size_shape(*src));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -107,25 +109,28 @@ void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &windo
const size_t out_stride = dst->info()->strides_in_bytes()[1];
const size_t vector_size = 16 / element_size;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *in_ptr = in.ptr();
- uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
-
- for(size_t k = 0; k < vector_size; ++k)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // If the src width is not multiple of W, we fill the reference with 0s
- if((id.x() + k) >= in_width)
- {
- std::memset(out_ptr + k * element_size, 0, element_size);
- }
- else
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr =
+ out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
+
+ for (size_t k = 0; k < vector_size; ++k)
{
- std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+ // If the src width is not multiple of W, we fill the reference with 0s
+ if ((id.x() + k) >= in_width)
+ {
+ std::memset(out_ptr + k * element_size, 0, element_size);
+ }
+ else
+ {
+ std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+ }
}
- }
- },
- in, out);
+ },
+ in, out);
}
const char *CpuGemmTranspose1xWKernel::name() const
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
index 0ca92641b7..4b834b2cc6 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -88,7 +88,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
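
The run_op body reworked above copies each source row in chunks of vector_size = 16 / element_size elements, placing the chunk from row y of chunk-column x at output row x / vector_size, column y * vector_size, and zero-filling any chunk that runs past the input width. A simplified, float-only reading of that indexing is sketched below; the function name and shapes are illustrative assumptions, not the library's API.

#include <vector>

// Re-tile a rows x cols matrix so each 1xK block of a source row becomes K consecutive
// elements of an output row; blocks past the source width are zero-filled.
std::vector<float> transpose_1xW(const std::vector<float> &src, int rows, int cols, int K)
{
    const int out_rows = (cols + K - 1) / K; // one output row per 1xK block column
    const int out_cols = rows * K;
    std::vector<float> dst(static_cast<size_t>(out_rows) * out_cols, 0.f);
    for (int y = 0; y < rows; ++y)
    {
        for (int x = 0; x < cols; x += K)
        {
            for (int k = 0; k < K; ++k)
            {
                const float v = (x + k < cols) ? src[static_cast<size_t>(y) * cols + x + k] : 0.f;
                dst[static_cast<size_t>(x / K) * out_cols + y * K + k] = v;
            }
        }
    }
    return dst;
}
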
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 9ac291549b..55ac7c5192 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -29,13 +29,13 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -51,26 +51,34 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon");
// Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
- TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
+ TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(
+ input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -106,14 +114,14 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
    // This for loop linearizes a volume with 3 slices. This allows:
// 1) to reduce the iterations of the outer for loop "d"
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
- for(; d <= (kernel_depth - 3); d += 3)
+ for (; d <= (kernel_depth - 3); d += 3)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -122,9 +130,9 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -132,9 +140,12 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -143,11 +154,11 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Left over
- for(; d < kernel_depth; d++)
+ for (; d < kernel_depth; d++)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
@@ -155,15 +166,16 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*out_ptr = pad_value;
}
else
{
- *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *out_ptr = *(reinterpret_cast<const T *>(
+ in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -171,7 +183,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -198,36 +210,39 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int end_y = start_y + kernel_height * dilation_y;
const int pad_quant = kernel_width * input_c;
const int element_size = static_cast<int>(sizeof(T));
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == input_c * element_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
out_ptr += input_c;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ input_c * element_size);
out_ptr += input_c;
}
}
@@ -235,13 +250,14 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -271,12 +287,13 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int element_size = static_cast<int>(sizeof(T));
const int channel_chunk_size = input_c * element_size;
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == channel_chunk_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
out_ptr += input_c + pad_right;
@@ -285,25 +302,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size);
out_ptr += input_c + pad_right;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
@@ -311,16 +329,17 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
- memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -348,7 +367,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
const int pad_top = _conv_info.pad_top();
const int stride_x = _conv_info.stride().first;
const int stride_y = _conv_info.stride().second;
- const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
+ const int pad_value =
+ is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
Window window_in_out(window);
// The first three dimensions of the input and output are increased by the inner loops
@@ -361,84 +381,57 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
Iterator out(dst, window_in_out);
execute_window_loop(
- window, [&](const Coordinates & id)
- {
- const int start_w = id[width_idx] * stride_x - pad_left;
- const int start_h = id[height_idx] * stride_y - pad_top;
+ window,
+ [&](const Coordinates &id)
+ {
+ const int start_w = id[width_idx] * stride_x - pad_left;
+ const int start_h = id[height_idx] * stride_y - pad_top;
- // Get pointers
- const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr =
+ reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) *
+ dst->info()->strides_in_bytes().y());
- // Linearize volume
- if(is_nchw)
- {
- linearize_volume_nchw<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_c,
- input_w,
- input_h,
- input_stride_x,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
- }
- else
- {
- if(_input_pad_right > 0)
+ // Linearize volume
+ if (is_nchw)
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y(),
- _input_pad_right);
+ linearize_volume_nchw<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_c, input_w,
+ input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
}
else
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
+ if (_input_pad_right > 0)
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, _has_bias, start_w, start_h,
+ _kernel_width, _kernel_height, input_w, input_h, input_c,
+ input_stride_y, input_stride_z, pad_value, _dilation.x(),
+ _dilation.y(), _input_pad_right);
+ }
+ else
+ {
+ linearize_volume_nhwc<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w,
+ input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
+ }
}
- }
- },
- in, out);
+ },
+ in, out);
}
-void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+void CpuIm2ColKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
ARM_COMPUTE_UNUSED(num_groups);
_data_layout = src->data_layout();
@@ -451,31 +444,34 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
_kernel_height = kernel_dims.height;
_input_pad_right = input_pad_right;
_dilation = dilation;
- _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx),
- _kernel_width, _kernel_height,
- _conv_info, _dilation);
+ _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+ _kernel_height, _conv_info, _dilation);
_has_bias = has_bias;
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true>
+ : &CpuIm2ColKernel::run_im2col<float, true, true>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8_SIGNED:
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -484,26 +480,31 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
else
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false>
+ : &CpuIm2ColKernel::run_im2col<float, true, false>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
case DataType::QASYMM8_SIGNED:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -512,11 +513,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right)));
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation,
+ false, num_groups, _input_pad_right)));
- std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
- kernel_dims.width, kernel_dims.height,
- conv_info, dilation);
+ std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height,
+ conv_info, dilation);
Window win = calculate_max_window(*src, Steps());
win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
@@ -526,10 +529,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
ICpuKernel::configure(win);
}
-Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status CpuIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
return Status{};
}
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
index d133f8dc2d..2cb26179ce 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -78,16 +79,28 @@ public:
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuIm2ColKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -117,15 +130,15 @@ private:
*/
using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
- Im2ColFunctionPtr _func{ nullptr };
+ Im2ColFunctionPtr _func{nullptr};
std::pair<unsigned int, unsigned int> _convolved_dims{};
PadStrideInfo _conv_info{};
- unsigned int _kernel_width{ 0 };
- unsigned int _kernel_height{ 0 };
- unsigned int _input_pad_right{ 0 };
- bool _has_bias{ false };
- Size2D _dilation{ 1U, 1U };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
+ unsigned int _kernel_width{0};
+ unsigned int _kernel_height{0};
+ unsigned int _input_pad_right{0};
+ bool _has_bias{false};
+ Size2D _dilation{1U, 1U};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
};
} // namespace kernels
} // namespace cpu
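
CpuIm2ColKernel's linearize_volume_* helpers, reformatted above, copy one receptive field of the input into a contiguous output row so that the convolution can then run as a single matrix multiply, writing the pad value wherever the window falls outside the image. The scalar sketch below shows the idea for the NCHW case only, with no dilation and no bias column; all names are illustrative, not the library's.

#include <vector>

// Copy the kernel_h x kernel_w patch whose top-left corner is (top, left), for every
// channel of an NCHW float volume, into `row`, writing 0.f outside the image bounds.
void im2col_patch_nchw(const std::vector<float> &src, int channels, int height, int width,
                       int kernel_h, int kernel_w, int top, int left, std::vector<float> &row)
{
    row.clear();
    row.reserve(static_cast<size_t>(channels) * kernel_h * kernel_w);
    for (int c = 0; c < channels; ++c)
    {
        for (int y = top; y < top + kernel_h; ++y)
        {
            for (int x = left; x < left + kernel_w; ++x)
            {
                const bool inside = (y >= 0 && y < height && x >= 0 && x < width);
                row.push_back(inside ? src[(static_cast<size_t>(c) * height + y) * width + x] : 0.f);
            }
        }
    }
}
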
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 39adc9af7c..b7daa4d583 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
#include "arm_compute/core/Types.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
namespace arm_compute
@@ -78,10 +79,10 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData
struct ActivationDataTypeISASelectorData
{
- DataType dt;
- const CPUModel &cpumodel;
- const cpuinfo::CpuIsaInfo &isa;
- const ActivationFunction f;
+ DataType dt;
+ const CPUModel &cpumodel;
+ const cpuinfo::CpuIsaInfo &isa;
+ const ActivationFunction f;
};
struct CpuAddKernelDataTypeISASelectorData
@@ -99,15 +100,19 @@ struct ScaleKernelDataTypeISASelectorData
};
// Selector pointer types
-using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
-using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
-using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
-using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
-using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
-using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
-using ActivationDataTypeISASelectorDataPtr = std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
-using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
-using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
+using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
+using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
+using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
+using DepthwiseConv2dNativeDataTypeISASelectorPtr =
+ std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
+using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
+using ActivationDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
+using CpuAddKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
+using ScaleKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
} // namespace kernels
} // namespace cpu
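
The selector aliases condensed above all use the same idiom: std::add_pointer<bool(const X &)>::type, which for a function type F simply names the corresponding pointer type F*. The short example below demonstrates the equivalence; SelectorData and is_fp32 are made-up names for illustration.

#include <type_traits>

struct SelectorData
{
    int dt; // stands in for an enum value in the real selector structs
};

using SelectorPtr = std::add_pointer<bool(const SelectorData &)>::type;
static_assert(std::is_same<SelectorPtr, bool (*)(const SelectorData &)>::value,
              "add_pointer on a function type names the corresponding function pointer");

bool is_fp32(const SelectorData &d) { return d.dt == 0; }

int main()
{
    SelectorPtr selector = &is_fp32; // same as assigning a plain function pointer
    return selector(SelectorData{0}) ? 0 : 1;
}
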
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
index 7d077c75bf..bcaa76b99b 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
@@ -24,11 +24,12 @@
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/maxunpool/list.h"
@@ -43,50 +44,43 @@ using namespace misc::shape_calculator;
namespace
{
-static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels =
-{
- {
- "neon_fp32_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(neon_fp32_maxunpooling)
- },
- {
- "neon_fp16_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_maxunpooling)
- },
- {
- "neon_qu8_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)
- },
- {
- "neon_qs8_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)
- },
+static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = {
+ {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(neon_fp32_maxunpooling)},
+ {"neon_fp16_maxunpooling",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_maxunpooling)},
+ {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)},
+ {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -96,13 +90,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, co
}
} // namespace
-void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info));
ARM_COMPUTE_UNUSED(indices);
- const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -113,7 +111,10 @@ void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensor
ICpuKernel::configure(window);
}
-Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info));
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
index d0c13471c8..5a641a2bea 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel>
{
private:
- using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
+ using MaxUnpoolingUKernelPtr = std::add_pointer<void(
+ const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
public:
/** Default constructor */
@@ -56,7 +57,8 @@ public:
* @param[out] dst Destination tensor. Data types supported: Same as @p src
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel
*
* @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -66,7 +68,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +88,7 @@ public:
const char *name() const override;
private:
- MaxUnpoolingUKernelPtr _run_method{ nullptr };
+ MaxUnpoolingUKernelPtr _run_method{nullptr};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index b73d2bdf73..ba086e3ac6 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -25,23 +25,24 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
namespace
{
#if defined(ENABLE_FP32_KERNELS)
- static constexpr size_t default_mws_N1_fp32_neon = 22447;
- static constexpr size_t default_mws_V1_fp32_neon = 38982;
+static constexpr size_t default_mws_N1_fp32_neon = 22447;
+static constexpr size_t default_mws_V1_fp32_neon = 38982;
#endif /* ENABLE_FP32_KERNELS */
- static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
-}
+static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
+} // namespace
namespace arm_compute
{
namespace cpu
@@ -54,29 +55,38 @@ const float scale255_constant = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
-inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+inline Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
DataType::S32, DataType::F16, DataType::F32);
- if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
+ if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP,
+ "ConvertPolicy cannot be WRAP if datatype is quantized");
}
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// clang-format off
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
@@ -88,13 +98,17 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src
!(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
, "Invalid data type combination");
// clang-format on
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 &&
+ scale != 1.f,
+ "Unsupported scale for QSYMM16 inputs and S32 dst");
}
- if(std::abs(scale - scale255_constant) < 0.00001f)
+ if (std::abs(scale - scale255_constant) < 0.00001f)
{
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
+ rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
+ dst->data_type() == DataType::S32,
"Scale == 1/255 is not supported if input and dst are of data type S32");
}
else
@@ -107,7 +121,8 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src
// Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
// frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
// Moreover, it will be negative as we deal with 1/2^n
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
+ "Scale value not supported (Should be 1/(2^n) or 1/255");
}
return Status{};
@@ -168,9 +183,9 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -178,7 +193,7 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -190,52 +205,52 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
- const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+ const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const float32x4x4_t out_f32x4x4 =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(non_broadcast_input_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(non_broadcast_input_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -251,56 +266,59 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto input1_q = wrapper::vloadq(input1_ptr + x);
- const auto input2_q = wrapper::vloadq(input2_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
- const float32x4x4_t out_f32x4x4 =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
+ const auto input1_q = wrapper::vloadq(input1_ptr + x);
+ const auto input2_q = wrapper::vloadq(input2_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(input1_ptr + x);
- const T src2 = *(input2_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(input1_ptr + x);
+ const T src2 = *(input2_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
}
}
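Both branches of mul_saturate_quantized_8 implement the same per-element computation that the scalar tail loops spell out: dequantize each input with its own (scale, offset), multiply in float, and requantize with tmp_qua_info, whose scale is out_scale / scale so the user-supplied scale is folded into the requantization. A reference-only scalar sketch follows, with illustrative names and QASYMM8 chosen as an example type; the vectorized vquantize path may round halfway values slightly differently.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar equivalent of one output element of the QASYMM8 saturating multiply.
static uint8_t mul_saturate_qasymm8_scalar(uint8_t a, float a_scale, int32_t a_offset,
                                           uint8_t b, float b_scale, int32_t b_offset,
                                           float out_scale, int32_t out_offset, float scale)
{
    // Dequantize with each input's own quantization info.
    const float fa = (static_cast<int32_t>(a) - a_offset) * a_scale;
    const float fb = (static_cast<int32_t>(b) - b_offset) * b_scale;

    // Multiply in float; requantize with the folded scale out_scale / scale.
    const float   tmp_scale = out_scale / scale;
    const int32_t q         = static_cast<int32_t>(std::lround((fa * fb) / tmp_scale)) + out_offset;

    // Saturate to the QASYMM8 range.
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}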
-bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
+bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ float scale)
{
const auto iq0 = src0->quantization_info().uniform();
const auto iq1 = src1->quantization_info().uniform();
@@ -308,7 +326,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
- if(multiplier < -8191.f || multiplier > 8191.f)
+ if (multiplier < -8191.f || multiplier > 8191.f)
{
//The multiplier cannot be stored as a 14.18 signed fixed-point number
return false;
@@ -318,7 +336,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
const auto max_result = multiplier * (256) * (256) + offset_out;
- if(max_result > 8191.f)
+ if (max_result > 8191.f)
{
//It might not be possible to store the result as a 14.18 signed fixed-point number.
return false;
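A 14.18 signed fixed-point value carries 14 integer bits (sign included) and 18 fractional bits, so its magnitude must stay below 2^13 = 8192; the two early-outs above reject a multiplier that cannot be encoded and a worst-case accumulator (inner product of two offset-corrected 8-bit operands, at most 256 * 256, plus the output offset) that would overflow it. A reference-only restatement of that test, with illustrative names:

// Illustrative restatement of the fixed-point feasibility check (not part of this patch).
static bool fits_in_14p18(float multiplier, float out_offset)
{
    if (multiplier < -8191.f || multiplier > 8191.f)
    {
        return false; // The multiplier itself cannot be stored as 14.18.
    }
    // Worst-case value accumulated before the final shift: multiplier * 256 * 256 + output offset.
    const float max_result = multiplier * 256.f * 256.f + out_offset;
    return max_result <= 8191.f;
}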
@@ -366,7 +384,7 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Prefix: a = non-broadcast, b = broadcast.
@@ -392,78 +410,76 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
- const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- const auto b_val = *b_ptr;
- const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
- const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
- const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
- const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
+ const auto b_val = *b_ptr;
+ const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
+ const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
- int x = window_start_x;
+ const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
+ const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
-
- // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
- const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
-
- const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
- const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
- const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
- const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
-
- const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
- const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
- const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
- const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
-
- const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
- const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
- const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
- const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
-
- // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
- const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
- const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
- const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
- const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
-
- const auto vout_15p1_0 = wrapper::vcombine(
- vout_15p1_00,
- vout_15p1_01);
-
- const auto vout_15p1_1 = wrapper::vcombine(
- vout_15p1_10,
- vout_15p1_11);
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+ int x = window_start_x;
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<2>(vout_15p1_0),
- wrapper::vqrshrn<2>(vout_15p1_1));
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
+ const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+ // These right shifts revert the multiplication by twopwr18. The hard limit of a maximum shift of 8 per instruction means the shift has to be split across multiple instructions.
+ const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01);
+
+ const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11);
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
- //Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
- b_val) - b_offset_16p0)) + out_offset_14p18)));
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
+ (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
+ out_offset_14p18)));
#else //__aarch64__
- out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
- }
- },
- a_input_it, b_input_it, out_it);
+ }
+ },
+ a_input_it, b_input_it, out_it);
}
else
{
@@ -481,82 +497,83 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
- const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
- const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
-
- // Widen the input elements to signed 16-bit regardless of the input signedness.
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
- const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
- const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
- const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
-
- const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
- const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
- const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
- const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
-
- const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
- const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
- const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
- const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
-
- const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
- const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
- const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
- const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
-
- const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
- const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
- const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
- const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
-
- // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
- const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
- const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
- const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
- const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
-
- const auto vout_14p2_0 = wrapper::vcombine(
- vout_14p2_00,
- vout_14p2_01);
-
- const auto vout_14p2_1 = wrapper::vcombine(
- vout_14p2_10,
- vout_14p2_11);
-
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<2>(vout_14p2_0),
- wrapper::vqrshrn<2>(vout_14p2_1));
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
+ const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
+
+ const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
+ const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+ // These right shifts revert the multiplication by twopwr18. The hard limit of a maximum shift of 8 per instruction means the shift has to be split across multiple instructions.
+ const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01);
+
+ const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11);
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
- //Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
- in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
+ wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
+ (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
+ out_offset_14p18)));
#else //__aarch64__
- out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
+ float(out_offset)));
#endif //__aarch64__
- }
- },
- in0_it, in1_it, out_it);
+ }
+ },
+ in0_it, in1_it, out_it);
}
}
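The fixed-point kernel above pre-scales the multiplier and the output offset by 2^18, accumulates the offset-corrected integer product in that 14.18 format, and then removes the 2^18 factor with the staged >>8, >>8, >>2 rounding/narrowing shifts. A reference-only scalar sketch of one element follows; it uses a single round-to-nearest on the 18 fractional bits and std::lround for the multiplier conversion, so it may differ from the staged saturating shifts at halfway values, and the names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative scalar model of the 14.18 fixed-point multiply (ScalarType is int8_t or uint8_t).
template <typename ScalarType>
static ScalarType mul_q8_fixedpoint_scalar(ScalarType a, int32_t a_offset,
                                           ScalarType b, int32_t b_offset,
                                           float multiplier, int32_t out_offset)
{
    const int64_t multiplier_14p18 = static_cast<int64_t>(std::lround(multiplier * (1 << 18)));
    const int64_t out_offset_14p18 = static_cast<int64_t>(out_offset) << 18;

    // Offset-corrected integer product, then scale and bias in 14.18 format.
    const int64_t inner     = static_cast<int64_t>(static_cast<int32_t>(a) - a_offset) *
                              static_cast<int64_t>(static_cast<int32_t>(b) - b_offset);
    const int64_t acc_14p18 = multiplier_14p18 * inner + out_offset_14p18;

    // Drop the 18 fractional bits with rounding, then saturate to the 8-bit output type.
    const int64_t rounded = (acc_14p18 + (1 << 17)) >> 18;
    const int64_t lo      = std::numeric_limits<ScalarType>::min();
    const int64_t hi      = std::numeric_limits<ScalarType>::max();
    return static_cast<ScalarType>(std::min(hi, std::max(lo, rounded)));
}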
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
+ const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
@@ -580,66 +597,61 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *sr
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const qsymm16x8x2_t input1_q =
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const qsymm16x8x2_t input1_q = {{
vld1q_s16(input1_ptr + x),
vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
+ }};
+ const qsymm16x8x2_t input2_q = {{
vld1q_s16(input2_ptr + x),
vld1q_s16(input2_ptr + x + 8),
- }
- };
+ }};
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
- float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
- float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst, lrintf() has same rounding mode as vcombine_s16
- int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
- qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
+ const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
+ float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
+ float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst, lrintf() has same rounding mode as vcombine_s16
+ int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
+ qsymm16_t tmp_qua =
+ static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
}
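QSYMM16 is symmetric, so dequantization is just value * scale with no offset; the product is requantized with the folded scale out_scale / scale and clamped to the int16 range, exactly as the scalar tail above does. A reference-only single-element form, with illustrative names:

#include <climits>
#include <cmath>
#include <cstdint>

// Illustrative scalar equivalent of one QSYMM16 output element (not part of this patch).
static int16_t mul_saturate_qsymm16_scalar(int16_t a, float a_scale,
                                           int16_t b, float b_scale,
                                           float out_scale, float scale)
{
    const float tmp_f = (a * a_scale) * (b * b_scale);
    // Requantize with the folded scale as in the tail loop above, then clamp to the int16 range.
    const long tmp = std::lrintf(tmp_f / (out_scale / scale));
    return static_cast<int16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
}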
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
@@ -665,74 +677,60 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const qsymm16x8x2_t input1_q =
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const qsymm16x8x2_t input1_q = {{
vld1q_s16(input1_ptr + x),
vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
+ }};
+ const qsymm16x8x2_t input2_q = {{
vld1q_s16(input2_ptr + x),
vld1q_s16(input2_ptr + x + 8),
- }
- };
+ }};
- const int32x4x4_t in1_s32 =
- {
- {
+ const int32x4x4_t in1_s32 = {{
vmovl_s16(vget_low_s16(input1_q.val[0])),
vmovl_s16(vget_high_s16(input1_q.val[0])),
vmovl_s16(vget_low_s16(input1_q.val[1])),
vmovl_s16(vget_high_s16(input1_q.val[1])),
- }
- };
- const int32x4x4_t in2_s32 =
- {
- {
+ }};
+ const int32x4x4_t in2_s32 = {{
vmovl_s16(vget_low_s16(input2_q.val[0])),
vmovl_s16(vget_high_s16(input2_q.val[0])),
vmovl_s16(vget_low_s16(input2_q.val[1])),
vmovl_s16(vget_high_s16(input2_q.val[1])),
- }
- };
+ }};
- const int32x4x4_t result =
- {
- {
+ const int32x4x4_t result = {{
vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
- }
- };
+ }};
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- vst1q_s32(output_ptr + x + 8, result.val[2]);
- vst1q_s32(output_ptr + x + 12, result.val[3]);
- }
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ vst1q_s32(output_ptr + x + 8, result.val[2]);
+ vst1q_s32(output_ptr + x + 12, result.val[3]);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- *(output_ptr + x) = tmp;
- }
- },
- input1, input2, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input1, input2, dst);
}
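The S32-output variant performs no requantization at all: each int16 element is widened and multiplied, and the raw 32-bit product is stored. Reference-only scalar form:

#include <cstdint>

// Illustrative scalar equivalent of the QSYMM16 * QSYMM16 -> S32 path (not part of this patch).
static int32_t mul_qsymm16_to_s32_scalar(int16_t a, int16_t b)
{
    return static_cast<int32_t>(a) * static_cast<int32_t>(b);
}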
template <bool is_scale255, bool is_sat>
@@ -757,79 +755,80 @@ void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
- const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
+ const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
- uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
- const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
- uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
- const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
+ uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
+ const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+ uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
+ const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
- tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
- tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
+ tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
- if(is_scale255)
- {
- tmp1_high = scale255_U16_U16(tmp1_high);
- tmp1_low = scale255_U16_U16(tmp1_low);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
+ if (is_scale255)
+ {
+ tmp1_high = scale255_U16_U16(tmp1_high);
+ tmp1_low = scale255_U16_U16(tmp1_low);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
- if(is_sat)
+ if (is_sat)
+ {
+ tmp1_high = vqshlq_u16(tmp1_high, vn);
+ tmp1_low = vqshlq_u16(tmp1_low, vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_u16(tmp1_high, vn);
+ tmp1_low = vshlq_u16(tmp1_low, vn);
+ }
+ }
+ if (is_sat)
{
- tmp1_high = vqshlq_u16(tmp1_high, vn);
- tmp1_low = vqshlq_u16(tmp1_low, vn);
+ vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
}
else
{
- tmp1_high = vshlq_u16(tmp1_high, vn);
- tmp1_low = vshlq_u16(tmp1_low, vn);
+ vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
}
}
- if(is_sat)
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
- }
- else
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<uint16_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
- if(is_sat && tmp > 255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = 255;
+ uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<uint16_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
+ if (is_sat && tmp > 255)
+ {
+ tmp = 255;
+ }
+ *(output_ptr + x) = static_cast<uint8_t>(tmp);
}
- *(output_ptr + x) = static_cast<uint8_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
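For U8 the 16-bit product is either scaled by 1/255 with round-half-up (is_scale255) or right-shifted by n, and optionally saturated to 255; the non-saturating store simply truncates to 8 bits, matching vmovn_u16. A reference-only scalar form, assuming scale255_constant is 1/255 as in the tail loop above:

#include <cstdint>

// Illustrative scalar equivalent of one U8 output element (not part of this patch).
template <bool is_scale255, bool is_sat>
static uint8_t mul_u8_scalar(uint8_t a, uint8_t b, int n)
{
    uint16_t tmp = static_cast<uint16_t>(a) * static_cast<uint16_t>(b);
    if (is_scale255)
    {
        tmp = static_cast<uint16_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f); // round half up
    }
    else
    {
        tmp >>= n;
    }
    if (is_sat && tmp > 255)
    {
        tmp = 255;
    }
    return static_cast<uint8_t>(tmp); // truncates to 8 bits when not saturating
}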
template <bool is_scale255, bool is_sat>
@@ -843,7 +842,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
- if(is_scale255)
+ if (is_scale255)
{
tmp1_high = scale255_S32_S32(tmp1_high);
tmp1_low = scale255_S32_S32(tmp1_low);
@@ -863,7 +862,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
- if(is_sat)
+ if (is_sat)
{
tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
@@ -875,7 +874,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
}
}
- if(is_sat)
+ if (is_sat)
{
return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
}
@@ -888,15 +887,10 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
- const int16x8x2_t result =
- {
- {
- // First 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
- // Second 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
- }
- };
+ const int16x8x2_t result = {{// First 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
+ // Second 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
return result;
}
@@ -923,67 +917,62 @@ void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const int16x8x2_t ta2 = {{
+ vld1q_s16(input2_ptr + x),
+ vld1q_s16(input2_ptr + x + 8),
+ }};
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
- if(is_scale255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
+ if (is_scale255)
{
- tmp >>= n;
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
}
else
{
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
}
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+ }
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
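The S16 tail loop above shows the signed-scaling detail the vector helpers reproduce: for a negative product the shift is biased by mask = (1 << n) - 1 so the arithmetic right shift rounds toward zero instead of toward negative infinity (for example, with n = 2, -5 becomes -1 rather than -2). Reference-only scalar form with illustrative names:

#include <climits>
#include <cstdint>

// Illustrative scalar equivalent of one S16 output element (not part of this patch).
template <bool is_scale255, bool is_sat>
static int16_t mul_s16_scalar(int16_t a, int16_t b, int n)
{
    int32_t tmp = static_cast<int32_t>(a) * static_cast<int32_t>(b);
    if (is_scale255)
    {
        tmp = static_cast<int32_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f);
    }
    else if (tmp >= 0)
    {
        tmp >>= n;
    }
    else
    {
        // Bias negative values so the arithmetic shift rounds toward zero.
        const uint32_t mask = (1u << n) - 1;
        tmp = (tmp + static_cast<int32_t>(mask)) >> n;
    }
    if (is_sat)
    {
        tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
    }
    return static_cast<int16_t>(tmp);
}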
template <bool is_sat>
@@ -1012,7 +1001,7 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &
const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
- if(is_sat)
+ if (is_sat)
{
tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
@@ -1029,15 +1018,10 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
- const int32x4x2_t result =
- {
- {
- // First 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
- // Second 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
- }
- };
+ const int32x4x2_t result = {{// First 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
+ // Second 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
return result;
}
@@ -1058,7 +1042,7 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1074,60 +1058,56 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
+ const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t broadcast_v =
- {
- {
- broadcast_value_vec,
- broadcast_value_vec,
- }
- };
- const int32x4x2_t non_broadcast_v =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x2_t broadcast_v = {{
+ broadcast_value_vec,
+ broadcast_value_vec,
+ }};
+ const int32x4x2_t non_broadcast_v = {{
vld1q_s32(non_broadcast_input_ptr + x),
vld1q_s32(non_broadcast_input_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
}
- if(is_sat)
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = utility::clamp<int64_t, int32_t>(tmp);
+ int64_t tmp =
+ static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
+
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ }
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
}
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1140,58 +1120,53 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int32x4x2_t ta1 =
+ const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x2_t ta1 = {{
+ vld1q_s32(input1_ptr + x),
+ vld1q_s32(input1_ptr + x + 4),
+ }};
+ const int32x4x2_t ta2 = {{
+ vld1q_s32(input2_ptr + x),
+ vld1q_s32(input2_ptr + x + 4),
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
+ int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+ if (tmp >= 0)
{
- vld1q_s32(input1_ptr + x),
- vld1q_s32(input1_ptr + x + 4),
+ tmp >>= n;
}
- };
- const int32x4x2_t ta2 =
- {
+ else
{
- vld1q_s32(input2_ptr + x),
- vld1q_s32(input2_ptr + x + 4),
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
}
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
}
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
}
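The S32 path keeps the product in 64 bits, applies the same round-toward-zero bias for negative values, and only clamps to the int32 range when is_sat is set; otherwise the store truncates. Reference-only scalar form:

#include <algorithm>
#include <cstdint>
#include <limits>

// Illustrative scalar equivalent of one S32 output element (not part of this patch).
template <bool is_sat>
static int32_t mul_s32_scalar(int32_t a, int32_t b, int n)
{
    int64_t tmp = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    if (tmp >= 0)
    {
        tmp >>= n;
    }
    else
    {
        // Bias negative values so the arithmetic shift rounds toward zero.
        const uint64_t mask = (static_cast<uint64_t>(1) << n) - 1;
        tmp = (tmp + static_cast<int64_t>(mask)) >> n;
    }
    if (is_sat)
    {
        tmp = std::min<int64_t>(std::numeric_limits<int32_t>::max(),
                                std::max<int64_t>(std::numeric_limits<int32_t>::min(), tmp));
    }
    return static_cast<int32_t>(tmp);
}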
@@ -1212,7 +1187,7 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1228,32 +1203,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1266,32 +1242,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto ta1 = wrapper::vloadq(input1_ptr + x);
- const auto ta2 = wrapper::vloadq(input2_ptr + x);
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto ta1 = wrapper::vloadq(input1_ptr + x);
+ const auto ta2 = wrapper::vloadq(input2_ptr + x);
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+ const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
}
}
@@ -1312,7 +1289,7 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1328,48 +1305,49 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
- float32x4_t b = vdupq_n_f32(broadcast_value);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
+ float32x4_t b = vdupq_n_f32(broadcast_value);
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
- float32x4_t res = wrapper::vmul(tmp0, b);
- b = wrapper::vmul(b, mask);
+ float32x4_t res = wrapper::vmul(tmp0, b);
+ b = wrapper::vmul(b, mask);
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
- const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
- auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
- auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
+ const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
+ auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
+ auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1382,51 +1360,52 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
- float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
+ float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
- float32x4_t res = wrapper::vmul(tmp0, b);
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
- b = wrapper::vrev64(b);
- b = wrapper::vmul(b, mask);
+ float32x4_t res = wrapper::vmul(tmp0, b);
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
+ b = wrapper::vrev64(b);
+ b = wrapper::vmul(b, mask);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto a0 = *(input1_ptr + 2 * x);
- const auto a1 = *(input1_ptr + 2 * x + 1);
- const auto b0 = *(input2_ptr + 2 * x);
- const auto b1 = *(input2_ptr + 2 * x + 1);
- auto res1 = a0 * b0 - a1 * b1;
- auto res2 = a0 * b1 + a1 * b0;
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- input1, input2, dst);
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto a0 = *(input1_ptr + 2 * x);
+ const auto a1 = *(input1_ptr + 2 * x + 1);
+ const auto b0 = *(input2_ptr + 2 * x);
+ const auto b1 = *(input2_ptr + 2 * x + 1);
+ auto res1 = a0 * b0 - a1 * b1;
+ auto res2 = a0 * b1 + a1 * b0;
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ input1, input2, dst);
}
}
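The scalar tail above is the plain form of the interleaved complex product this kernel computes: for packed (real, imaginary) pairs, (a0 + i*a1)(b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0); the NEON body builds the same result with the {-1, 1, -1, 1} mask and vrev64. A minimal standalone reference of that identity (hypothetical helper, not part of the kernel):

    #include <cstddef>

    // Plain scalar reference for the interleaved complex multiply computed above.
    static void complex_mul_ref(const float *in1, const float *in2, float *out, std::size_t num_pairs)
    {
        for (std::size_t x = 0; x < num_pairs; ++x)
        {
            const float a0 = in1[2 * x];     // real part of the first operand
            const float a1 = in1[2 * x + 1]; // imaginary part of the first operand
            const float b0 = in2[2 * x];
            const float b1 = in2[2 * x + 1];
            out[2 * x]     = a0 * b0 - a1 * b1; // real part of the product
            out[2 * x + 1] = a0 * b1 + a1 * b0; // imaginary part of the product
        }
    }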
@@ -1444,7 +1423,7 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1457,48 +1436,40 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
- const float16x8x2_t broadcast_value_vec =
+ win,
+ [&](const Coordinates &)
{
- {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
+ const float16x8x2_t broadcast_value_vec = {{
vdupq_n_f16(broadcast_value),
vdupq_n_f16(broadcast_value),
- }
- };
- const auto scale_vec = vdupq_n_f16(scale);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t non_broadcast_v =
+ }};
+ const auto scale_vec = vdupq_n_f16(scale);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t non_broadcast_v = {{
vld1q_f16(non_broadcast_input_ptr + x),
vld1q_f16(non_broadcast_input_ptr + x + 8),
- }
- };
- const float16x8x2_t result =
+ }};
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1508,49 +1479,41 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator input2(src2, input2_win);
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t ta1 =
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(input1_ptr + x),
- vld1q_f16(input1_ptr + x + 8),
- }
- };
- const float16x8x2_t ta2 =
- {
- {
- vld1q_f16(input2_ptr + x),
- vld1q_f16(input2_ptr + x + 8),
- }
- };
- const float16x8_t scale_vec = vdupq_n_f16(scale);
- const float16x8x2_t result =
+ const float16x8x2_t ta1 = {{
+ vld1q_f16(input1_ptr + x),
+ vld1q_f16(input1_ptr + x + 8),
+ }};
+ const float16x8x2_t ta2 = {{
+ vld1q_f16(input2_ptr + x),
+ vld1q_f16(input2_ptr + x + 8),
+ }};
+ const float16x8_t scale_vec = vdupq_n_f16(scale);
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
}
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -1577,81 +1540,82 @@ void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
- const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
-
- uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
- uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
- tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
- tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- if(is_scale255)
- {
- tmp_low = scale255_U16_U16(tmp_low);
- tmp_high = scale255_U16_U16(tmp_high);
- }
- else
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const int16x8_t vn = vdupq_n_s16(-n);
+ const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
+ const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
- if(is_sat)
+ uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
+ uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+ tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+ tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+ if (is_scale255)
{
- tmp_low = vqshlq_u16(tmp_low, vn);
- tmp_high = vqshlq_u16(tmp_high, vn);
+ tmp_low = scale255_U16_U16(tmp_low);
+ tmp_high = scale255_U16_U16(tmp_high);
}
else
{
- tmp_low = vshlq_u16(tmp_low, vn);
- tmp_high = vshlq_u16(tmp_high, vn);
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if (is_sat)
+ {
+ tmp_low = vqshlq_u16(tmp_low, vn);
+ tmp_high = vqshlq_u16(tmp_high, vn);
+ }
+ else
+ {
+ tmp_low = vshlq_u16(tmp_low, vn);
+ tmp_high = vshlq_u16(tmp_high, vn);
+ }
}
- }
- if(is_sat)
- {
- static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+ if (is_sat)
+ {
+ static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
- tmp_low = vminq_u16(tmp_low, max);
- tmp_high = vminq_u16(tmp_high, max);
+ tmp_low = vminq_u16(tmp_low, max);
+ tmp_high = vminq_u16(tmp_high, max);
+ }
+
+ vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
+ vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
}
- vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
- vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+ }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
-
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
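For reference, the left-over path of mul_U8_U8_S16 above reduces to the following scalar routine. It is a sketch, not library code, and assumes scale255_constant equals 1.f/255.f:

    #include <climits>
    #include <cstdint>

    // Scalar sketch of one U8 * U8 -> S16 element, mirroring the left-over loop above.
    static int16_t mul_u8_u8_s16_ref(uint8_t a, uint8_t b, bool is_scale255, bool is_sat, int n)
    {
        int32_t tmp = static_cast<int32_t>(a) * static_cast<int32_t>(b);
        if (is_scale255)
        {
            // scale == 1/255: multiply and round to the nearest integer
            tmp = static_cast<int32_t>(static_cast<float>(tmp) * (1.0f / 255.0f) + 0.5f);
        }
        else
        {
            // scale == 2^-n: shift right; the product of two U8 values is never negative
            tmp >>= n;
        }
        if (is_sat)
        {
            tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; // clamp to the upper S16 bound
        }
        return static_cast<int16_t>(tmp);
    }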
@@ -1676,75 +1640,65 @@ void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const uint8x8x2_t ta2u =
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const uint8x8x2_t ta2u = {{
vld1_u8(input2_ptr + x),
vld1_u8(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
- }
- };
-
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+ }};
+ const int16x8x2_t ta2 = {
+ {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
- if(is_scale255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
+ if (is_scale255)
{
- tmp >>= n;
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
}
else
{
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
+ }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
}
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
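The negative branch of the left-over loop above adds (1u << n) - 1 before shifting so that the division by 2^n rounds toward zero rather than toward negative infinity, which is what a bare arithmetic shift would do. A minimal sketch of that behaviour (not library code):

    #include <cstdint>

    // Shift-based division by 2^n that truncates toward zero, matching the
    // negative-value handling in the left-over loop above.
    static int32_t shift_div_towards_zero(int32_t value, int n)
    {
        if (value >= 0)
        {
            return value >> n; // non-negative values: a plain shift already truncates toward zero
        }
        const int32_t bias = static_cast<int32_t>((1u << n) - 1);
        return (value + bias) >> n; // e.g. n = 2: (-5 + 3) >> 2 == -1, the same as -5 / 4
    }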
@@ -1755,7 +1709,12 @@ void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons
}
} // namespace
-void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void CpuMulKernel::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
@@ -1775,7 +1734,7 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
bool is_scale_255 = false;
// Check and validate scaling factor
- if(std::abs(scale - scale255_constant) < 0.00001f)
+ if (std::abs(scale - scale255_constant) < 0.00001f)
{
is_scale_255 = true;
}
@@ -1795,12 +1754,12 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
const DataType dt_output = dst->data_type();
const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
- switch(dt_input1)
+ switch (dt_input1)
{
case DataType::QASYMM8:
- if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
+ if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
{
- if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
{
_func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
}
@@ -1811,9 +1770,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::QASYMM8_SIGNED:
- if(dt_input2 == DataType::QASYMM8_SIGNED)
+ if (dt_input2 == DataType::QASYMM8_SIGNED)
{
- if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
{
_func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
}
@@ -1824,19 +1783,19 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::QSYMM16:
- if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
+ if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
{
_func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
}
- else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
+ else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
{
_func_int = &mul_QSYMM16_QSYMM16_S32;
}
break;
case DataType::S16:
- if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
}
@@ -1845,9 +1804,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
}
}
- if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
}
@@ -1858,15 +1817,15 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::S32:
- if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
+ if (DataType::S32 == dt_input2 && DataType::S32 == dt_output)
{
_func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
}
break;
case DataType::U8:
- if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ if (DataType::U8 == dt_input2 && DataType::U8 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
}
@@ -1875,9 +1834,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
}
}
- else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
}
@@ -1886,9 +1845,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
}
}
- else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
}
@@ -1922,20 +1881,20 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_func_float == &mul_F32_F32_F32)
+ if (this->_func_float == &mul_F32_F32_F32)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
else
{
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
        // Don't split the workload too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1945,7 +1904,7 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -1958,10 +1917,10 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
    // Don't split the workload too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1970,8 +1929,12 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return default_mws;
}
-Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
+Status CpuMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
@@ -1989,11 +1952,11 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_func_quantized != nullptr)
+ if (_func_quantized != nullptr)
{
(*_func_quantized)(src1, src2, dst, window, _scale);
}
- else if(_func_int != nullptr)
+ else if (_func_int != nullptr)
{
(*_func_int)(src1, src2, dst, window, _scale_exponent);
}
@@ -2021,10 +1984,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
}
return Status{};
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index 9e4a37110b..7eaf287507 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_MUL_KERNEL_H
#include "arm_compute/core/Rounding.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -68,17 +69,27 @@ public:
* @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuMulKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
// Inherited methods overridden
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Return minimum workload size of the relevant kernel
@@ -108,7 +119,8 @@ private:
* @param[in] window Region on which to execute the kernel
* @param[in] scale Integer scale factor.
*/
- using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
+ using MulFunctionInt =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
/** Common signature for all the specialised multiplication functions with float scaling factor
*
* @param[in] src1 Src1 tensor object.
@@ -117,7 +129,8 @@ private:
* @param[in] window Region on which to execute the kernel
* @param[in] scale Float scale factor.
*/
- using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+ using MulFunctionFloat =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
/** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
*
* @param[in] src1 Src1 tensor object.
@@ -127,14 +140,15 @@ private:
* @param[in] scale Float scale factor.
*
*/
- using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+ using MulFunctionQuantized =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
- MulFunctionFloat *_func_float{ nullptr };
- MulFunctionInt *_func_int{ nullptr };
- MulFunctionQuantized *_func_quantized{ nullptr };
- float _scale{ 0 };
- int _scale_exponent{ 0 };
- size_t _split_dimension{ Window::DimY };
+ MulFunctionFloat *_func_float{nullptr};
+ MulFunctionInt *_func_int{nullptr};
+ MulFunctionQuantized *_func_quantized{nullptr};
+ float _scale{0};
+ int _scale_exponent{0};
+ size_t _split_dimension{Window::DimY};
};
/** Interface for the complex pixelwise multiplication kernel. */
@@ -159,7 +173,7 @@ public:
static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp
index d65e011032..b444a25ff7 100644
--- a/src/cpu/kernels/CpuPermuteKernel.cpp
+++ b/src/cpu/kernels/CpuPermuteKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,56 +49,31 @@ namespace
{
inline bool is_permutation_supported(const PermutationVector &v)
{
- static const std::array<PermutationVector, 2> permutations2 =
- {
- {
- PermutationVector(0U, 1U),
- PermutationVector(1U, 0U),
- }
- };
- static const std::array<PermutationVector, 6> permutations3 =
- {
- {
- PermutationVector(2U, 0U, 1U),
- PermutationVector(1U, 2U, 0U),
- PermutationVector(0U, 1U, 2U),
- PermutationVector(0U, 2U, 1U),
- PermutationVector(1U, 0U, 2U),
- PermutationVector(2U, 1U, 0U),
- }
- };
- static const std::array<PermutationVector, 24> permutations4 =
- {
- {
- PermutationVector(0U, 1U, 2U, 3U),
- PermutationVector(1U, 0U, 2U, 3U),
- PermutationVector(2U, 0U, 1U, 3U),
- PermutationVector(0U, 2U, 1U, 3U),
- PermutationVector(1U, 2U, 0U, 3U),
- PermutationVector(2U, 1U, 0U, 3U),
- PermutationVector(2U, 1U, 3U, 0U),
- PermutationVector(1U, 2U, 3U, 0U),
- PermutationVector(3U, 2U, 1U, 0U),
- PermutationVector(2U, 3U, 1U, 0U),
- PermutationVector(1U, 3U, 2U, 0U),
- PermutationVector(3U, 1U, 2U, 0U),
- PermutationVector(3U, 0U, 2U, 1U),
- PermutationVector(0U, 3U, 2U, 1U),
- PermutationVector(2U, 3U, 0U, 1U),
- PermutationVector(3U, 2U, 0U, 1U),
- PermutationVector(0U, 2U, 3U, 1U),
- PermutationVector(2U, 0U, 3U, 1U),
- PermutationVector(1U, 0U, 3U, 2U),
- PermutationVector(0U, 1U, 3U, 2U),
- PermutationVector(3U, 1U, 0U, 2U),
- PermutationVector(1U, 3U, 0U, 2U),
- PermutationVector(0U, 3U, 1U, 2U),
- PermutationVector(3U, 0U, 1U, 2U)
- }
- };
+ static const std::array<PermutationVector, 2> permutations2 = {{
+ PermutationVector(0U, 1U),
+ PermutationVector(1U, 0U),
+ }};
+ static const std::array<PermutationVector, 6> permutations3 = {{
+ PermutationVector(2U, 0U, 1U),
+ PermutationVector(1U, 2U, 0U),
+ PermutationVector(0U, 1U, 2U),
+ PermutationVector(0U, 2U, 1U),
+ PermutationVector(1U, 0U, 2U),
+ PermutationVector(2U, 1U, 0U),
+ }};
+ static const std::array<PermutationVector, 24> permutations4 = {
+ {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U),
+ PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U),
+ PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U),
+ PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U),
+ PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U),
+ PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U),
+ PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U),
+ PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}};
- return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
- || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+ return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) ||
+ (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) ||
+ (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
}
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
@@ -108,7 +84,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
// Validate configured destination
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -128,18 +104,22 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
// we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
// we have to fall back to C++
- if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }))
+ if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) ||
+ (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}))
{
- window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
- window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
- window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+ window_src.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+ window_src.set(Window::DimY,
+ Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+ window_src.set(Window::DimZ,
+ Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
}
// Destination window
Window window_dst(window);
const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
+ for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
{
window_dst.set(d, zero_window);
}
@@ -157,7 +137,7 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
int n_channels = 0;
int n_batches = 0;
- switch(src_layout)
+ switch (src_layout)
{
case DataLayout::NCHW:
{
@@ -189,38 +169,42 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
}
// CHW -> HWC
- if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U })
+ if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U})
{
const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
- reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
+ reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols,
+ in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride,
+ out_row_stride, out_col_stride);
+ },
+ src_it, dst_it);
}
// HWC -> CHW
- else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })
+ else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})
{
const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
- reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
+ reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels,
+ in_batch_stride, in_row_stride, in_col_stride, out_batch_stride,
+ out_channel_stride, out_row_stride);
+ },
+ src_it, dst_it);
}
else
{
@@ -230,12 +214,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
Strides perm_strides = strides;
permute_strides(perm_strides, perm);
const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
- *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
- },
- src_it, dst_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx =
+ id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+ *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
+ },
+ src_it, dst_it);
}
}
} // namespace
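In the generic fallback just above, the destination strides are permuted once (perm_strides) and every element is then scattered using a single dot product between the source coordinate and those permuted strides. A standalone sketch of the scatter step, taking already-permuted destination byte strides as input (hypothetical helper, not library code):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy a 4D tensor element by element, computing the destination offset from the
    // source coordinate and pre-permuted destination strides, as in the loop above.
    // Dimensions the tensor does not have should be given size 1 and stride 0.
    static void permute_scatter(const uint8_t *src, uint8_t *dst,
                                const std::size_t shape[4],        // source shape, innermost dimension first
                                const std::size_t src_strides[4],  // source strides in bytes
                                const std::size_t perm_strides[4], // destination strides in bytes, already permuted
                                std::size_t element_size)
    {
        for (std::size_t i3 = 0; i3 < shape[3]; ++i3)
            for (std::size_t i2 = 0; i2 < shape[2]; ++i2)
                for (std::size_t i1 = 0; i1 < shape[1]; ++i1)
                    for (std::size_t i0 = 0; i0 < shape[0]; ++i0)
                    {
                        const std::size_t src_off = i0 * src_strides[0] + i1 * src_strides[1] +
                                                    i2 * src_strides[2] + i3 * src_strides[3];
                        const std::size_t dst_off = i0 * perm_strides[0] + i1 * perm_strides[1] +
                                                    i2 * perm_strides[2] + i3 * perm_strides[3];
                        std::memcpy(dst + dst_off, src + src_off, element_size);
                    }
    }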
@@ -275,7 +262,7 @@ void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(src->info()->element_size())
+ switch (src->info()->element_size())
{
case 1:
run_permute<uint8_t>(window, src, dst, _perm);
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
index 9e1b93318e..0cb2faf223 100644
--- a/src/cpu/kernels/CpuPermuteKernel.h
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -57,7 +57,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
index d72a41cbbe..9308d860d1 100644
--- a/src/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -25,15 +25,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/pool2d/neon/list.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -46,99 +48,111 @@ namespace
{
using namespace misc::shape_calculator;
-static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels =
-{
- {
- "neon_qu8_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
- },
- {
- "neon_qs8_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
- },
- {
- "neon_f16_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
- },
- {
- "neon_fp32_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
- },
+static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = {
+ {"neon_qu8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)},
+ {"neon_qs8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)},
+ {"neon_f16_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)},
+ {"neon_fp32_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)},
#if defined(ENABLE_NCHW_KERNELS)
- {
- "neon_qu8_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qu8_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qu8_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qs8_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_qs8_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_qs8_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_fp16_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
- },
- {
- "neon_fp16_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
- },
- {
- "neon_fp16_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool7",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
- },
+ {"neon_qu8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)},
+ {"neon_qs8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)},
+ {"neon_fp16_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)},
+ {"neon_fp16_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)},
+ {"neon_fp16_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)},
+ {"neon_fp32_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool7",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)},
+ {"neon_fp32_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)},
#endif /* defined(ENABLE_NCHW_KERNELS) */
};
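The table above follows the micro-kernel registration pattern used throughout these CPU kernels: each entry pairs a name, a selector predicate over the data type / layout / pool configuration, and the function to register. The sketch below assumes first-match selection and uses placeholder types; it is not the actual get_implementation:

    #include <functional>
    #include <string>
    #include <vector>

    // Placeholder for the selector data (data type, layout, pool size, ISA flags, ...).
    struct SelectorData
    {
        int data_type;
        int data_layout;
        int pool_size_x;
    };

    struct KernelEntry
    {
        std::string                               name;
        std::function<bool(const SelectorData &)> is_selected;
        void (*ukernel)(const SelectorData &); // placeholder signature for the registered function
    };

    // Return the first registered kernel whose selector accepts the given configuration,
    // or nullptr if none matches.
    static const KernelEntry *select_kernel(const std::vector<KernelEntry> &table, const SelectorData &data)
    {
        for (const auto &entry : table)
        {
            if (entry.is_selected && entry.is_selected(data) && entry.ukernel != nullptr)
            {
                return &entry;
            }
        }
        return nullptr;
    }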
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
- const ITensorInfo *indices, Size2D pool_size)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices,
+ Size2D pool_size)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
@@ -150,65 +164,78 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
int output_height = 0;
PoolingType pool_type = pool_info.pool_type;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type()))
- && (is_pool_region_entirely_outside_input(pool_info)),
- "Pooling region that is entirely outside input tensor is unsupported for non-float types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)),
+ "Pooling region that is entirely outside input tensor is unsupported for non-float types");
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+ std::tie(output_width, output_height) =
+ scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(),
+ pool_size.y(), pool_info.pad_stride_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+ "Calculated output dimension size is invalid");
TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- if(indices)
+ if (indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
}
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (src->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding &&
+ (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() &&
+ (src->data_layout() == DataLayout::NHWC),
+ "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
- if(indices)
+ if (indices)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices),
+ "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC),
+ "Pooling kernel indices only supported for NHWC");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
}
}
- const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() });
+ const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{
+ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
- unsigned int &num_elems_processed_per_iteration,
- int pool_size_x, int pool_size_y)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src,
+ ITensorInfo *dst,
+ ITensorInfo *indices,
+ const PoolingLayerInfo &pool_info,
+ unsigned int &num_elems_processed_per_iteration,
+ int pool_size_x,
+ int pool_size_y)
{
    // dst auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
- if(indices)
+ if (indices)
{
        // Indices auto initialization if not yet initialized
- auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
- pool_info)))
- .set_data_type(DataType::U32) /* we store the offset to the element */);
+ auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)))
+ .set_data_type(DataType::U32) /* we store the offset to the element */);
}
const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -219,20 +246,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const bool is_square = pool_size_x == pool_size_y;
- const unsigned int pooled_w = dst->dimension(idx_width);
- const unsigned int pooled_h = dst->dimension(idx_height);
+ const bool is_square = pool_size_x == pool_size_y;
+ const unsigned int pooled_w = dst->dimension(idx_width);
+ const unsigned int pooled_h = dst->dimension(idx_height);
    // If the pool is not square (and thus not one of the optimized cases), the generic MxN path is executed
num_elems_processed_per_iteration = 1;
- if(is_square)
+ if (is_square)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
- switch(pool_size_x)
+ switch (pool_size_x)
{
case 2:
num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
@@ -261,18 +288,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
bool window_changed = false;
Window win{};
// Upper limit for the number of right/bottom border elements that are accessed
- TensorShape dst_shape{ src->tensor_shape() };
+ TensorShape dst_shape{src->tensor_shape()};
dst_shape.set(0, pooled_w);
dst_shape.set(1, pooled_h);
TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
-void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -284,14 +315,15 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Update pool size in case of global pooling
- const Size2D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
+ const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
- const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() });
+ const auto *uk = CpuPool2dKernel::get_implementation(
+ PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first,
+ pool_size, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
// Set instance variables
@@ -302,7 +334,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
_run_method = uk->ukernel;
_name = std::string("CpuPool2dKernel").append("/").append(uk->name);
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
// Configure kernel window
Window win = calculate_max_window(*dst, Steps());
@@ -311,14 +343,17 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
else
{
// Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
- pool_size.x(), pool_size.y());
+ auto win_config = validate_and_configure_window(
+ src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICpuKernel::configure(win_config.second);
}
}
-Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -336,9 +371,10 @@ Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst,
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
- (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration,
- pool_size_x, pool_size_y)
- .first);
+ (indices) ? indices->clone().get() : nullptr, pool_info,
+ num_elems_processed_per_iteration, pool_size_x,
+ pool_size_y)
+ .first);
return Status{};
}
@@ -359,19 +395,20 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
const unsigned int pool_size = _pool_info.pool_size.width;
Window window_src(window);
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
// Set step for src in x and y direction for the src
unsigned int window_x_inc = 0;
- switch(src->info()->data_type())
+ switch (src->info()->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
{
window_x_inc = pool_stride_x;
- if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
{
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2
+ : _num_elems_processed_per_iteration;
}
break;
}
@@ -387,8 +424,10 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
ARM_COMPUTE_ERROR("Not supported");
}
}
- window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+ window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x,
+ window.x().end() * pool_stride_x, window_x_inc));
+ window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y,
+ window.y().end() * pool_stride_y, pool_stride_y));
}
else
{
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
index c952ea839d..859de8cc5f 100644
--- a/src/cpu/kernels/CpuPool2dKernel.h
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -38,7 +39,8 @@ namespace kernels
class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
{
private:
- using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+ using PoolingKernelPtr = std::add_pointer<void(
+ const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
public:
CpuPool2dKernel() = default;
@@ -52,17 +54,21 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuPool2dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct PoolingKernel
@@ -76,11 +82,11 @@ public:
private:
PoolingLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 0 };
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{0};
Size2D _pool_size{};
int _pool_stride_x{};
- PoolingKernelPtr _run_method{ nullptr };
+ PoolingKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
index 4504f3f7c9..8b484d4e0b 100644
--- a/src/cpu/kernels/CpuPool3dKernel.cpp
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/pool3d/list.h"
@@ -41,39 +42,28 @@ namespace
{
using namespace misc::shape_calculator;
-static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
-{
- {
- "neon_qu8_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)
- },
- {
- "neon_qs8_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)
- },
- {
- "neon_fp16_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
- },
- {
- "neon_fp32_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
- }
-};
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = {
+ {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)},
+ {"neon_qs8_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)},
+ {"neon_fp16_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)},
+ {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
- && (pool_info.pool_type == PoolingType::AVG)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+ (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
"Exclude padding is unsupported for non-float types for Avg op");
const auto data_layout = src->data_layout();
@@ -97,21 +87,26 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
int output_height = 0;
int output_depth = 0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
- std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
- pool_size_x, pool_size_y, pool_size_z, pool_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+ src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+ TensorInfo out_info(
+ TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
}
- const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -136,12 +131,12 @@ void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
// Update pool size in case of global pooling
const bool is_global_pooling = pool_info.is_global_pooling;
- const Size3D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
- is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+ const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+ is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
- const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
// Set instance variables
@@ -188,4 +183,4 @@ const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_availa
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
index 437f2af7e4..bd1ff61046 100644
--- a/src/cpu/kernels/CpuPool3dKernel.h
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,7 +40,8 @@ class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
{
private:
/* Template function for Pooling 3D NDHWC */
- using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+ using Pooling3dKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
public:
CpuPool3dKernel() = default;
@@ -68,7 +70,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct Pooling3dKernel
@@ -82,11 +84,11 @@ public:
private:
Pooling3dLayerInfo _pool_info{};
- Pooling3dKernelPtr _run_method{ nullptr };
+ Pooling3dKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 9700c62318..5dde680837 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -28,13 +28,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
#include <arm_neon.h>
#include <map>
@@ -53,9 +53,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
return Status{};
@@ -71,19 +73,15 @@ inline float32x4x4_t load_value(const T *input_ptr)
template <>
inline float32x4x4_t load_value(const float *input_ptr)
{
- return { wrapper::vloadq(input_ptr),
- wrapper::vloadq(input_ptr + 4),
- wrapper::vloadq(input_ptr + 8),
- wrapper::vloadq(input_ptr + 12) };
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
+ wrapper::vloadq(input_ptr + 12)};
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
inline float32x4x4_t load_value(const float16_t *input_ptr)
{
- return { vcvt_f32_f16(wrapper::vload(input_ptr)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -113,26 +111,25 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
- {
- { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
- { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
- { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
+ static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = {
+ {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>},
+ {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>},
+ {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>},
- { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
- { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
- { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
+ {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>},
+ {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
+ {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
- { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
+ {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
- { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
- { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
- { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
+ {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
+ {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>},
+ {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
- { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
- { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
+ {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>},
+ {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>},
+ {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
};
@@ -142,7 +139,7 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
auto it = quant_map.find(function_to_call);
- if(it == quant_map.end())
+ if (it == quant_map.end())
{
ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
}
@@ -167,7 +164,7 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -177,22 +174,24 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+ }
+ },
+ input, output);
}
template <typename TIn, typename TOut>
@@ -203,7 +202,7 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -219,23 +218,25 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
}
template <typename T>
@@ -246,7 +247,7 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst,
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -262,25 +263,27 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst,
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
- vst1q_u16(&output_ptr[x], tmp.val[0]);
- vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+ vst1q_u16(&output_ptr[x], tmp.val[0]);
+ vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
}
void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 2bc8105a11..d6714136da 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -59,7 +59,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -67,7 +67,9 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+ using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window);
/** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
*
* @param[in] window Region on which to execute the kernel.
@@ -84,7 +86,7 @@ private:
template <typename TIn, typename TOut>
void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
- QuantizeFunctionExecutorPtr _func{ nullptr };
+ QuantizeFunctionExecutorPtr _func{nullptr};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index a9672a8c5e..241e58fbce 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -29,9 +29,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
+
#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+
#include <cstdint>
/** [NEReshapeLayerKernel Kernel] **/
@@ -49,7 +51,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->tensor_shape().total_size() != 0)
+ if (dst->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -59,29 +61,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
return Status{};
}
-
template <typename T>
void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
{
const TensorShape &src_shape = src->info()->tensor_shape();
const TensorShape &dst_shape = dst->info()->tensor_shape();
- Iterator dst_it(dst, window);
+ Iterator dst_it(dst, window);
- execute_window_loop(window, [&](const Coordinates & dst_coord)
- {
- Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
- const auto output_ptr = dst->ptr_to_element(dst_coord);
- const auto input_ptr = src->ptr_to_element(src_coord);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &dst_coord)
+ {
+ Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ const auto output_ptr = dst->ptr_to_element(dst_coord);
+ const auto input_ptr = src->ptr_to_element(src_coord);
- *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
- },
- dst_it);
+ *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+ },
+ dst_it);
}
-void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst )
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
{
- switch(src->info()->data_type())
+ switch (src->info()->data_type())
{
case DataType::U8:
case DataType::S8:
@@ -131,22 +134,24 @@ void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *d
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator dst_it(dst, win);
- execute_window_loop(win, [&]( Coordinates & id)
- {
- dst_coord = id;
-
- for(int x = window_start_x; x < window_end_x; x += src_row_size)
+ execute_window_loop(
+ win,
+ [&](Coordinates &id)
{
- src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
- output_ptr = dst->ptr_to_element(dst_coord);
- input_ptr = src->ptr_to_element(src_coord);
+ dst_coord = id;
- std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+ for (int x = window_start_x; x < window_end_x; x += src_row_size)
+ {
+ src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ output_ptr = dst->ptr_to_element(dst_coord);
+ input_ptr = src->ptr_to_element(src_coord);
- dst_coord.increment(Window::DimX, src_row_size);
- }
- },
- dst_it);
+ std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+ dst_coord.increment(Window::DimX, src_row_size);
+ }
+ },
+ dst_it);
}
void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
@@ -213,8 +218,8 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const ITensorInfo* src_info = src->info();
- const ITensorInfo* dst_info = dst->info();
+ const ITensorInfo *src_info = src->info();
+ const ITensorInfo *dst_info = dst->info();
// Calculate kernel window based on the padding info
Window win;
@@ -226,7 +231,7 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]);
const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]);
- if(!src_has_holes && !dst_has_holes)
+ if (!src_has_holes && !dst_has_holes)
{
std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
/*
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index eddbbf7135..ce566fd9e2 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -55,7 +55,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
@@ -84,10 +84,9 @@ public:
}
private:
- size_t _split_dimension{ Window::DimY };
-
- std::function<void(const Window &window, const ITensor *src, ITensor *dst )> _reshape_tensor_fn{};
+ size_t _split_dimension{Window::DimY};
+ std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
index 332304599f..702e0a8134 100644
--- a/src/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,104 +45,74 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels =
-{
- {
- "sve_fp16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
- },
- {
- "sve_fp32_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
- },
- {
- "sve_qu8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
- },
- {
- "sve_qs8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
- },
- {
- "sve_u8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
- },
- {
- "sve_s16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
- },
- {
- "neon_fp16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
- },
- {
- "neon_fp32_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
- },
- {
- "neon_qu8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
- },
- {
- "neon_qs8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
- },
- {
- "neon_u8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
- },
- {
- "neon_s8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)
- },
- {
- "neon_s16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)
- },
+static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = {
+ {"sve_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)},
+ {"sve_fp32_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)},
+ {"sve_qu8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)},
+ {"sve_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)},
+ {"sve_u8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)},
+ {"sve_s16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)},
+ {"neon_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)},
+ {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)},
+ {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)},
+ {"neon_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)},
+ {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)},
+ {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)},
+ {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
- const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
ARM_COMPUTE_UNUSED(info.constant_border_value);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
@@ -153,27 +124,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I
ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
- ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR
- || info.border_mode != BorderMode::REPLICATE));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) &&
+ (data_layout != DataLayout::NHWC ||
+ info.interpolation_policy != InterpolationPolicy::BILINEAR ||
+ info.border_mode != BorderMode::REPLICATE));
- if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
+ if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
}
- if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
+ if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- if(dx != nullptr && dy != nullptr)
+ if (dx != nullptr && dy != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
}
}
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+ ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners &&
+ !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
- if(info.interpolation_policy == InterpolationPolicy::AREA)
+ if (info.interpolation_policy == InterpolationPolicy::AREA)
{
ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
@@ -183,24 +157,28 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I
}
} // namespace
-void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
- ITensorInfo *dst, const ScaleKernelInfo &info)
+void CpuScaleKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- dx,
- dy,
- offsets,
- dst,
- info));
-
- const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info));
+
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
- _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy));
+ _name = std::string("CpuScaleKernel")
+ .append("/")
+ .append(uk->name)
+ .append("_")
+ .append(string_from_interpolation_policy(info.interpolation_policy));
// Get data layout and width/height indices
_data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
@@ -212,19 +190,22 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
_constant_border_value = info.constant_border_value;
_align_corners = info.align_corners;
- if(info.sampling_policy == SamplingPolicy::CENTER)
+ if (info.sampling_policy == SamplingPolicy::CENTER)
{
_sampling_offset = 0.5f;
}
// Compute the ratio between source width/height and destination width/height
- const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
+ _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _policy;
- if(_border_mode == BorderMode::UNDEFINED)
+ if (_border_mode == BorderMode::UNDEFINED)
{
_border_mode = BorderMode::CONSTANT;
_constant_border_value = PixelValue();
@@ -232,39 +213,38 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
#ifdef ENABLE_NCHW_KERNELS
// Configure scale function to run
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
std::string function_to_call("scale_");
function_to_call += string_from_data_type(src->data_type()) + "_";
function_to_call += string_from_data_layout(_data_layout) + "_";
function_to_call += string_from_interpolation_policy(_policy);
- static std::map<std::string, ScaleFunctionPtr> map_function =
- {
- { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
+ static std::map<std::string, ScaleFunctionPtr> map_function = {
+ {"scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8},
- { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
- { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+ {"scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t>},
+ {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
- { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
- { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+ {"scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t>},
+ {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
- { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
- { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
+ {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t>},
+ {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t>},
- { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
- { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
+ {"scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t>},
+ {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t>},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
- { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
+ {"scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t>},
+ {"scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t>},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
- { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
+ {"scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float>},
+ {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float>},
};
auto it = map_function.find(function_to_call);
- if(it != map_function.end())
+ if (it != map_function.end())
{
_func = it->second;
}
@@ -278,13 +258,19 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
#ifdef ENABLE_NCHW_KERNELS
template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy);
const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
// Don't increment in X and Y direction for the input tensor
// A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -296,7 +282,7 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const
Window win_off;
win_off.set(Window::DimX, window[Window::DimX]);
win_off.set(Window::DimY, window[Window::DimY]);
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -305,24 +291,33 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const
Iterator src_i(src, win_in);
Iterator dst_i(dst, window);
Iterator offsets_i(offsets, win_off);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
- const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((
- id.y() + _sampling_offset)
- * hr));
- const int32_t offset_row = in_yi * in_stride_x;
- *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
- },
- src_i, offsets_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
+ const auto in_yi = static_cast<int32_t>(
+ _align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr)
+ : std::floor((id.y() + _sampling_offset) * hr));
+ const int32_t offset_row = in_yi * in_stride_x;
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
+ },
+ src_i, offsets_i, dst_i);
}
template <typename T>
-void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- Window win_off;
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ Window win_off;
win_off.set(Window::DimX, window.x());
win_off.set(Window::DimY, window.y());
@@ -332,7 +327,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -347,7 +342,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
const int32_t in_dim_h = src->info()->dimension(1);
const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
- if(_border_mode == BorderMode::CONSTANT)
+ if (_border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -355,52 +350,60 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
using ConstType = T;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
}
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
- const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
- const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
- const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
+ else if (_border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
+ const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
+ const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
+ const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
}
else
{
@@ -408,7 +411,12 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
}
}
-void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets);
using namespace scale_helpers;
@@ -425,50 +433,60 @@ void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const
Iterator src_i(src, win_in);
Iterator dst_i(dst, window);
- const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
const auto w = src->info()->dimension(0);
const auto h = src->info()->dimension(1);
const size_t in_stride = src->info()->strides_in_bytes()[1];
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
-
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
-
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
-
- vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
- },
- src_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+
+ vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ src_i, dst_i);
}
template <typename T>
-void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
// Get data layout and width/height indices
const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners);
+ const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height),
+ dst->info()->dimension(idx_height), _align_corners);
Window win_off;
win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
@@ -479,7 +497,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
win_in.set(idx_width, Window::Dimension(0, 0, 0));
win_in.set(idx_height, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -495,7 +513,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
- if(_border_mode == BorderMode::CONSTANT)
+ if (_border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -503,62 +521,74 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
using ConstType = T;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
}
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
+ else if (_border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+ const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+ const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+ const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
}
else
{
@@ -567,8 +597,12 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
}
#endif // ENABLE_NCHW_KERNELS
-Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
+Status CpuScaleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *output,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
return Status{};
@@ -588,13 +622,14 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th
const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1);
const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
(this->*_func)(src, dst, dx, dy, offsets, window);
}
else
{
- _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
+ _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset,
+ _align_corners, window);
}
}
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
index 8102142fc3..38142df021 100644
--- a/src/cpu/kernels/CpuScaleKernel.h
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_SCALEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,9 +40,19 @@ class CpuScaleKernel : public ICpuKernel<CpuScaleKernel>
{
private:
/** Scale function to use for the particular function to use */
- using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
- using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
+ using ScaleFunctionPtr = void (CpuScaleKernel::*)(
+ const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
+ using ScaleKernelPtr = std::add_pointer<void(const ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ InterpolationPolicy,
+ BorderMode,
+ PixelValue,
+ float,
+ bool,
+ const Window &)>::type;
public:
CpuScaleKernel() = default;
@@ -59,7 +70,11 @@ public:
* @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to use for configuration
*/
- void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -67,11 +82,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
const ScaleKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct ScaleKernel
@@ -89,28 +108,48 @@ private:
*
* @note Used only in case down-sampling.
*/
- void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_area_nchw_u8(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using bilinear interpolation on the given window */
template <typename T>
- void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using bilinear interpolation on the given window */
template <typename T>
- void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_bilinear_qasymm(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using nearest neighbour on the given window */
template <typename T>
- void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
#endif // ENABLE_NCHW_KERNELS
- ScaleFunctionPtr _func{ nullptr };
+ ScaleFunctionPtr _func{nullptr};
InterpolationPolicy _policy{};
BorderMode _border_mode{};
PixelValue _constant_border_value{};
- float _sampling_offset{ 0 };
- bool _align_corners{ false };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- ScaleKernelPtr _run_method{ nullptr };
+ float _sampling_offset{0};
+ bool _align_corners{false};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ ScaleKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
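// --------------------------------------------------------------------------------
// Illustrative sketch (hypothetical names, not library code): the per-pixel work
// performed by the bilinear NCHW/QASYMM paths in the diff above. Four neighbours
// around (x, y) are fetched according to the border mode (CONSTANT substitutes the
// border value for out-of-range taps, REPLICATE clamps the indices) and blended by
// the fractional offsets dx/dy. delta_bilinear() is assumed here to be the usual
// convex blend; the quantized variant above additionally dequantizes the four taps
// before blending and requantizes the result.
#include <algorithm>
#include <cstdio>

// Standard bilinear blend: each neighbour is weighted by the area opposite it.
static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;
    return a00 * dx1 * dy1 + a01 * dx * dy1 + a10 * dx1 * dy + a11 * dx * dy;
}

enum class Border { Constant, Replicate };

// src is a single W x H plane with a row stride of `stride` elements.
static float sample_bilinear(const float *src, int w, int h, int stride,
                             int x, int y, float dx, float dy,
                             Border mode, float border_value)
{
    auto fetch = [&](int xi, int yi) -> float
    {
        if (mode == Border::Replicate)
        {
            xi = std::clamp(xi, 0, w - 1); // clamp indices, as the REPLICATE branch does
            yi = std::clamp(yi, 0, h - 1);
            return src[xi + yi * stride];
        }
        const bool inside = (0 <= xi && xi < w && 0 <= yi && yi < h);
        return inside ? src[xi + yi * stride] : border_value; // CONSTANT branch
    };

    const float a00 = fetch(x, y);
    const float a01 = fetch(x + 1, y);
    const float a10 = fetch(x, y + 1);
    const float a11 = fetch(x + 1, y + 1);
    return delta_bilinear(a00, a01, a10, a11, dx, dy);
}

int main()
{
    const float plane[4] = {10.f, 20.f, 30.f, 40.f}; // 2x2 image
    // Centre of the 2x2 block -> average of the four values (25.0).
    std::printf("%.1f\n", sample_bilinear(plane, 2, 2, 2, 0, 0, 0.5f, 0.5f, Border::Replicate, 0.f));
    return 0;
}
// --------------------------------------------------------------------------------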
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index e06ab9917c..ce144351f8 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -30,11 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/softmax/list.h"
namespace arm_compute
@@ -46,61 +46,44 @@ namespace kernels
namespace
{
/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits =
-{
- {
- "sve_fp32_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_logits)
- },
- {
- "sve_fp16_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_logits)
- },
- {
- "sve_qu8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
- REGISTER_QASYMM8_SVE(sve_qasymm8_logits)
- },
- {
- "sve_qs8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
- REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)
- },
- {
- "neon_fp32_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_logits)
- },
- {
- "neon_fp16_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_logits)
- },
- {
- "neon_qu8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(neon_qasymm8_logits)
- },
- {
- "neon_qs8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)
- },
+static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
+ {"sve_fp32_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(sve_fp32_logits)},
+ {"sve_fp16_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(sve_fp16_logits)},
+ {"sve_qu8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
+ REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
+ {"sve_qs8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
+ REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
+ {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_logits)},
+ {"neon_fp16_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_logits)},
+ {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
+ {"neon_qs8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
};
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
// Validate in case of configured output
- if(output.total_size() != 0)
+ if (output.total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
+ TensorShape(input.tensor_shape()).set(0, 1));
}
return Status{};
@@ -121,7 +104,7 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
- const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_run_method = uk->ukernel;
@@ -158,60 +141,46 @@ const char *CpuLogits1DMaxKernel::name() const
}
/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits =
-{
- {
- "sve2_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)
- },
- {
- "sve2_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)
- },
- {
- "sve_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_softmax)
- },
- {
- "sve_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_softmax)
- },
-
- {
- "neon_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_softmax)
- },
- {
- "neon_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_softmax)
- },
- {
- "neon_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
- },
- {
- "neon_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
- },
+template <bool IS_LOG>
+static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
+ {"sve2_qu8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
+ {"sve2_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
+ {"sve_fp32_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(sve_fp32_softmax)},
+ {"sve_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(sve_fp16_softmax)},
+
+ {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax)},
+ {"neon_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax)},
+ {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
+ {"neon_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
};
namespace
{
-Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
- const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
+Status validate_arguments_logits_softmax(const ITensorInfo &src,
+ const ITensorInfo &max,
+ const ITensorInfo &dst,
+ const float beta,
+ const ITensorInfo &tmp,
+ bool is_log)
{
ARM_COMPUTE_UNUSED(beta);
// Check input
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
@@ -221,16 +190,18 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
// Check output if configured
- if(dst.total_size() != 0)
+ if (dst.total_size() != 0)
{
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log)
+ : dst.quantization_info();
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
}
// Check tmp if configured
- if(tmp.total_size() != 0)
+ if (tmp.total_size() != 0)
{
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
@@ -243,14 +214,16 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
}
} // namespace
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+template <bool IS_LOG>
+const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
+CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
{
return available_kernels_logits<IS_LOG>;
}
template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
+ const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -259,17 +232,21 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
// Output auto initialization if not yet initialized
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+ : dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
// Tmp auto initialization if not yet initialized
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
- const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+ std::string kernel_name =
+ IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
_beta = beta;
_run_method = uk->ukernel;
@@ -282,8 +259,8 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
}
template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+ const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -305,7 +282,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
@@ -314,7 +291,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
}
template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
{
return _name.c_str();
}
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 59f43bd1d2..5d288179fd 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -57,7 +57,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct SoftmaxLogits1DMaxKernel
@@ -70,7 +70,7 @@ public:
static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
private:
- SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
+ SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
std::string _name{};
};
@@ -79,7 +79,8 @@ template <bool IS_LOG = false>
class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
{
private:
- using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+ using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
public:
CpuLogits1DSoftmaxKernel() = default;
@@ -95,18 +96,22 @@ public:
*
* @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuLogits1DSoftmaxKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *max,
+ const ITensorInfo *dst,
+ const float beta,
+ const ITensorInfo *tmp);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct SoftmaxLogits1DKernel
@@ -119,8 +124,8 @@ public:
static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
private:
- float _beta{ 1.0f };
- SoftmaxLogits1DKernelPtr _run_method{ nullptr };
+ float _beta{1.0f};
+ SoftmaxLogits1DKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
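// --------------------------------------------------------------------------------
// Illustrative sketch (hypothetical names, not library code): how tables such as
// available_kernels_max_logits / available_kernels_logits above are consumed. Each
// entry pairs a human-readable name and a selector predicate with a micro-kernel
// pointer; a get_implementation-style lookup returns the first entry whose
// predicate accepts the current data type / ISA, which is why the SVE/SVE2 entries
// are listed before the NEON fallbacks. The selector fields below are simplified
// stand-ins for the library's DataTypeISASelectorData.
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct SelectorData
{
    bool is_fp16;  // stands in for data.dt == DataType::F16
    bool has_sve;  // stands in for data.isa.sve
    bool has_fp16; // stands in for data.isa.fp16
};

struct MicroKernel
{
    std::string name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*run)();
};

static void sve_fp16_stub() { std::puts("sve fp16 kernel"); }
static void neon_fp32_stub() { std::puts("neon fp32 kernel"); }

static const std::vector<MicroKernel> kernels = {
    {"sve_fp16", [](const SelectorData &d) { return d.is_fp16 && d.has_sve && d.has_fp16; }, sve_fp16_stub},
    {"neon_fp32", [](const SelectorData &d) { return !d.is_fp16; }, neon_fp32_stub},
};

// First matching entry wins; returns nullptr if nothing is suitable.
static const MicroKernel *get_implementation(const SelectorData &d)
{
    for (const auto &k : kernels)
    {
        if (k.is_selected(d))
        {
            return &k;
        }
    }
    return nullptr;
}

int main()
{
    const MicroKernel *uk = get_implementation(SelectorData{false, false, false});
    if (uk != nullptr)
    {
        uk->run(); // prints "neon fp32 kernel"
    }
    return 0;
}
// --------------------------------------------------------------------------------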
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index 875d613dca..2b2c6f2e92 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/add/generic/neon/impl.h"
@@ -51,70 +52,48 @@ namespace
using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData;
using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
-static const std::vector<CpuSubKernel::SubKernel> available_kernels =
-{
- {
- "neon_fp32_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
- },
- {
- "neon_fp16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
- },
- {
- "neon_u8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
- },
- {
- "neon_s16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
- },
- {
- "neon_s32_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
- },
- {
- "neon_qu8_sub_fixedpoint",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)
- },
- {
- "neon_qs8_sub_fixedpoint",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)
- },
- {
- "neon_qu8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
- },
- {
- "neon_qs8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
- },
- {
- "neon_qs16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
- },
+static const std::vector<CpuSubKernel::SubKernel> available_kernels = {
+ {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)},
+ {"neon_fp16_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)},
+ {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)},
+ {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)},
+ {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)},
+ {"neon_qu8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)},
+ {"neon_qs8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)},
+ {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)},
+ {"neon_qs8_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)},
+ {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)},
};
-inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+inline Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
- const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -125,7 +104,7 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src
"Convert policy cannot be WRAP if datatype is quantized");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -147,7 +126,8 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
set_data_type_if_unknown(*dst, src0->data_type());
const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst);
- const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -167,14 +147,14 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &sub_same_neon<float>)
+ if (this->_run_method == &sub_same_neon<float>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
@@ -184,7 +164,7 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -203,7 +183,8 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return ICPPKernel::default_mws;
}
-Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index cd209d1837..5fa0dc411a 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuSubKernel : public ICpuKernel<CpuSubKernel>
{
private:
- using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ using SubKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
public:
@@ -68,7 +69,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -99,9 +101,9 @@ public:
private:
ConvertPolicy _policy{};
- SubKernelPtr _run_method{ nullptr };
+ SubKernelPtr _run_method{nullptr};
std::string _name{};
- size_t _split_dimension{ Window::DimY };
+ size_t _split_dimension{Window::DimY};
};
} // namespace kernels
} // namespace cpu
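// --------------------------------------------------------------------------------
// Illustrative sketch (hypothetical names, not library code): the ConvertPolicy
// argument validated in CpuSubKernel above. WRAP lets integer subtraction overflow
// modulo 2^N, while SATURATE clamps to the representable range; this is why
// validate_arguments() rejects WRAP for quantized inputs, where wrap-around would
// corrupt the quantized values.
#include <algorithm>
#include <cstdint>
#include <cstdio>

enum class ConvertPolicy { Wrap, Saturate };

static uint8_t sub_u8(uint8_t a, uint8_t b, ConvertPolicy policy)
{
    const int32_t result = static_cast<int32_t>(a) - static_cast<int32_t>(b);
    if (policy == ConvertPolicy::Saturate)
    {
        return static_cast<uint8_t>(std::clamp<int32_t>(result, 0, 255)); // clamp to [0, 255]
    }
    return static_cast<uint8_t>(result & 0xFF); // WRAP: keep the low 8 bits
}

int main()
{
    std::printf("saturate: %u\n", sub_u8(10, 20, ConvertPolicy::Saturate)); // 0
    std::printf("wrap:     %u\n", sub_u8(10, 20, ConvertPolicy::Wrap));     // 246
    return 0;
}
// --------------------------------------------------------------------------------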
diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp
index b2cebc4230..615bc6ce1e 100644
--- a/src/cpu/kernels/CpuTransposeKernel.cpp
+++ b/src/cpu/kernels/CpuTransposeKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +46,7 @@ namespace
{
unsigned int num_elems_processed(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return 8;
@@ -81,10 +82,10 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -101,87 +102,121 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
- const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
- const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
- const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
- const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
- const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
- const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
- const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
-
- // Transpose 2x2
- const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
- const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
- const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
- const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
-
- // Transpose 4x4
- const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
- const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
- const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
- const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
-
- // Transpose 8x8
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
- const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
- const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
- }
-
- // Compute left-over elements along the x dimension (1x8)
- for(; x < window_end_x; ++x)
- {
- const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
- const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
- const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
- const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
- const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
- const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
- const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
- const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
-
- uint8x8_t result = vdup_n_u8(0);
- result = vset_lane_u8(val0, result, 0);
- result = vset_lane_u8(val1, result, 1);
- result = vset_lane_u8(val2, result, 2);
- result = vset_lane_u8(val3, result, 3);
- result = vset_lane_u8(val4, result, 4);
- result = vset_lane_u8(val5, result, 5);
- result = vset_lane_u8(val6, result, 6);
- result = vset_lane_u8(val7, result, 7);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(output.ptr() + dst_offset_in_bytes, result);
- }
- },
- input, output);
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x8_t row0 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
+ const uint8x8_t row1 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
+ const uint8x8_t row2 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
+ const uint8x8_t row3 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
+ const uint8x8_t row4 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
+ const uint8x8_t row5 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
+ const uint8x8_t row6 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
+ const uint8x8_t row7 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+ const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+ const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+ const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+
+ // Transpose 4x4
+ const uint16x4x2_t k0_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+ const uint16x4x2_t k1_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+ const uint16x4x2_t k2_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+ const uint16x4x2_t k3_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+
+ // Transpose 8x8
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+ const uint32x2x2_t k2_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+ const uint32x2x2_t k3_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+ }
+
+ // Compute left-over elements along the x dimension (1x8)
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
+ const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
+ const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
+ const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
+ const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
+ const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
+ const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
+ const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
+
+ uint8x8_t result = vdup_n_u8(0);
+ result = vset_lane_u8(val0, result, 0);
+ result = vset_lane_u8(val1, result, 1);
+ result = vset_lane_u8(val2, result, 2);
+ result = vset_lane_u8(val3, result, 3);
+ result = vset_lane_u8(val4, result, 4);
+ result = vset_lane_u8(val5, result, 5);
+ result = vset_lane_u8(val6, result, 6);
+ result = vset_lane_u8(val7, result, 7);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(output.ptr() + dst_offset_in_bytes, result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -190,16 +225,18 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint8_t val0 = *input.ptr();
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint8_t val0 = *input.ptr();
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
- *(output.ptr() + dst_offset_in_bytes) = val0;
- },
- input, output);
+ *(output.ptr() + dst_offset_in_bytes) = val0;
+ },
+ input, output);
}
}
@@ -220,10 +257,10 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -240,61 +277,77 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
- const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
-
- // Transpose 4x4
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint16x4_t result = vdup_n_u16(0);
- result = vset_lane_u16(val0, result, 0);
- result = vset_lane_u16(val1, result, 1);
- result = vset_lane_u16(val2, result, 2);
- result = vset_lane_u16(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x4_t row0 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16x4_t row1 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16x4_t row2 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16x4_t row3 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+ const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+
+ // Transpose 4x4
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[1]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint16x4_t result = vdup_n_u16(0);
+ result = vset_lane_u16(val0, result, 0);
+ result = vset_lane_u16(val1, result, 1);
+ result = vset_lane_u16(val2, result, 2);
+ result = vset_lane_u16(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -303,16 +356,18 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
@@ -347,10 +402,10 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -367,102 +422,160 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- // Load
- const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
- const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
- const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
- const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
- const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
-
- // Transpose 2x4
- const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])};
- const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])};
- const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])};
- const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])};
- const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])};
- const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])};
- const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])};
- const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])};
-
- // Transpose 2x2
- const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
- const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
- const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
- const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
- const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
- const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
- const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
- const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
-
- // Swap blocks
- const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])};
- const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])};
- const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])};
- const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])};
- const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])};
- const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])};
- const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])};
- const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])};
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Store
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7);
- }
-
- // Compute left-over elements (8x1)
- for(; x < window_end_x; ++x)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
- const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
- const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
- const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
- const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
-
- uint32x4_t result0 = vdupq_n_u32(0);
- uint32x4_t result1 = vdupq_n_u32(0);
- result0 = vsetq_lane_u32(val0, result0, 0);
- result0 = vsetq_lane_u32(val1, result0, 1);
- result0 = vsetq_lane_u32(val2, result0, 2);
- result0 = vsetq_lane_u32(val3, result0, 3);
- result1 = vsetq_lane_u32(val4, result1, 0);
- result1 = vsetq_lane_u32(val5, result1, 1);
- result1 = vsetq_lane_u32(val6, result1, 2);
- result1 = vsetq_lane_u32(val7, result1, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
- }
- },
- input, output);
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load
+ const uint32x4x2_t row0 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row1 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row2 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row3 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row4 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row5 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row6 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row7 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ // Transpose 2x4
+ const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]),
+ vtrn2q_u32(row0.val[0], row1.val[0])};
+ const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]),
+ vtrn2q_u32(row0.val[1], row1.val[1])};
+ const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]),
+ vtrn2q_u32(row2.val[0], row3.val[0])};
+ const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]),
+ vtrn2q_u32(row2.val[1], row3.val[1])};
+ const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]),
+ vtrn2q_u32(row4.val[0], row5.val[0])};
+ const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]),
+ vtrn2q_u32(row4.val[1], row5.val[1])};
+ const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]),
+ vtrn2q_u32(row6.val[0], row7.val[0])};
+ const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]),
+ vtrn2q_u32(row6.val[1], row7.val[1])};
+
+ // Transpose 2x2
+ const uint64x2x2_t k0_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
+ const uint64x2x2_t k1_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
+ const uint64x2x2_t k2_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
+ const uint64x2x2_t k3_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
+ const uint64x2x2_t k4_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
+ const uint64x2x2_t k5_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
+ const uint64x2x2_t k6_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
+ const uint64x2x2_t k7_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
+
+ // Swap blocks
+ const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]),
+ vreinterpretq_u32_u64(k4_u64.val[0])};
+ const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]),
+ vreinterpretq_u32_u64(k5_u64.val[0])};
+ const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]),
+ vreinterpretq_u32_u64(k4_u64.val[1])};
+ const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]),
+ vreinterpretq_u32_u64(k5_u64.val[1])};
+ const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]),
+ vreinterpretq_u32_u64(k6_u64.val[0])};
+ const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]),
+ vreinterpretq_u32_u64(k7_u64.val[0])};
+ const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]),
+ vreinterpretq_u32_u64(k6_u64.val[1])};
+ const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]),
+ vreinterpretq_u32_u64(k7_u64.val[1])};
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Store
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ col0);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ col1);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ col2);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ col3);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ col4);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ col5);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ col6);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ col7);
+ }
+
+ // Compute left-over elements (8x1)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ uint32x4_t result0 = vdupq_n_u32(0);
+ uint32x4_t result1 = vdupq_n_u32(0);
+ result0 = vsetq_lane_u32(val0, result0, 0);
+ result0 = vsetq_lane_u32(val1, result0, 1);
+ result0 = vsetq_lane_u32(val2, result0, 2);
+ result0 = vsetq_lane_u32(val3, result0, 3);
+ result1 = vsetq_lane_u32(val4, result1, 0);
+ result1 = vsetq_lane_u32(val5, result1, 1);
+ result1 = vsetq_lane_u32(val6, result1, 2);
+ result1 = vsetq_lane_u32(val7, result1, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -471,40 +584,42 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
#else // __aarch64__
void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+ const int window_step_x = 4;
+ const int window_step_y = 4;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
// Check if we need a left-over loop for the y dimension
bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -521,60 +636,74 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
- const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
- const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
- const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Swap block 01 with block 10 and store
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint32x4_t result = vdupq_n_u32(0);
- result = vsetq_lane_u32(val0, result, 0);
- result = vsetq_lane_u32(val1, result, 1);
- result = vsetq_lane_u32(val2, result, 2);
- result = vsetq_lane_u32(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint32x4_t row0 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4_t row1 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4_t row2 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4_t row3 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Swap block 01 with block 10 and store
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint32x4_t result = vdupq_n_u32(0);
+ result = vsetq_lane_u32(val0, result, 0);
+ result = vsetq_lane_u32(val1, result, 1);
+ result = vsetq_lane_u32(val2, result, 2);
+ result = vsetq_lane_u32(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -583,16 +712,18 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
#endif // __aarch64__
@@ -616,7 +747,8 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
// Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped
Coordinates coord;
@@ -637,7 +769,7 @@ Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *d
"Element size not supported");
// Validate configured destination
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
@@ -658,7 +790,7 @@ void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cons
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(src->info()->element_size())
+ switch (src->info()->element_size())
{
case 1:
transpose_8bit_elements(src, dst, window);
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
index cb85daeb40..e79a405677 100644
--- a/src/cpu/kernels/CpuTransposeKernel.h
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
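The transpose kernels reformatted above all build an NxN transpose out of 2x2 interleaves (vtrn) at progressively wider element types. As a reference point, here is a standalone sketch of the same zig-zag used by the non-aarch64 32-bit path: transposing one row-major 4x4 block of uint32_t. It is an illustration only, not library code, and assumes an Arm target where <arm_neon.h> is available.

#include <arm_neon.h>

#include <cstdint>
#include <cstdio>

// Transpose a row-major 4x4 block of 32-bit values.
static void transpose4x4_u32(const uint32_t *src, uint32_t *dst)
{
    const uint32x4_t row0 = vld1q_u32(src + 0);
    const uint32x4_t row1 = vld1q_u32(src + 4);
    const uint32x4_t row2 = vld1q_u32(src + 8);
    const uint32x4_t row3 = vld1q_u32(src + 12);

    // Transpose the four 2x2 sub-blocks (same k0..k3 naming as the kernel).
    const uint32x2x2_t k0 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));   // top-left
    const uint32x2x2_t k1 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); // bottom-right
    const uint32x2x2_t k2 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); // top-right
    const uint32x2x2_t k3 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));   // bottom-left

    // Recombine rows, swapping the off-diagonal 2x2 blocks.
    vst1q_u32(dst + 0, vcombine_u32(k0.val[0], k3.val[0]));
    vst1q_u32(dst + 4, vcombine_u32(k0.val[1], k3.val[1]));
    vst1q_u32(dst + 8, vcombine_u32(k2.val[0], k1.val[0]));
    vst1q_u32(dst + 12, vcombine_u32(k2.val[1], k1.val[1]));
}

int main()
{
    uint32_t in[16];
    uint32_t out[16];
    for (uint32_t i = 0; i < 16; ++i)
    {
        in[i] = i;
    }
    transpose4x4_u32(in, out);
    for (int r = 0; r < 4; ++r)
    {
        std::printf("%u %u %u %u\n", out[4 * r + 0], out[4 * r + 1], out[4 * r + 2], out[4 * r + 3]);
    }
    return 0;
}

Expected output is the transposed index matrix (0 4 8 12 / 1 5 9 13 / ...), matching what the kernel writes row by row at dst_offset_in_bytes + n * output_stride_in_bytes.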
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
index 2ccc977995..297ba63826 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,7 +39,7 @@ namespace
{
TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
{
- TensorShape output_shape{ src->tensor_shape() };
+ TensorShape output_shape{src->tensor_shape()};
output_shape.collapse(3);
const size_t tmp_dim = output_shape[0];
@@ -54,20 +55,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4]));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] ||
+ biases->dimension(1) != src->tensor_shape()[4]));
}
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ get_output_shape(src, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -84,9 +87,7 @@ void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInf
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- biases,
- dst));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst));
// Configure kernel
Window window = calculate_max_window(*src, Steps());
@@ -122,44 +123,47 @@ void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window,
// Create iterators
Iterator in(src, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
- const int kernel_idz = id[4];
-
- // Setup pointers
- const uint8_t *tmp_input_ptr = in.ptr();
- uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
- const uint8_t *curr_input_row_ptr = tmp_input_ptr;
- const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- for(unsigned int j = 0; j < kernel_size_y; ++j)
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for (unsigned int d = 0; d < kernel_depth; ++d)
{
- for(unsigned int i = 0; i < kernel_size_x; ++i)
+ for (unsigned int j = 0; j < kernel_size_y; ++j)
{
- std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
+ for (unsigned int i = 0; i < kernel_size_x; ++i)
+ {
+ std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
}
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
}
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
- // Add bias
- if(biases != nullptr)
- {
- std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size());
- }
- },
- in);
+ // Add bias
+ if (biases != nullptr)
+ {
+ std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)),
+ src->info()->element_size());
+ }
+ },
+ in);
}
const char *CpuWeightsReshapeKernel::name() const
{
@@ -167,4 +171,4 @@ const char *CpuWeightsReshapeKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
index 1a260edc96..9310b3c784 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.h
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -82,7 +82,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
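For readers tracing the reshape loop above: each kernel volume is walked x-fastest, then rows, then depth, and copied element by element into a single destination column; when a bias tensor is present, its value for that kernel is appended as the last element of the column. A minimal scalar sketch of that layout, with illustrative names rather than the library's API:

#include <cstddef>
#include <vector>

// Flatten one W x H x D kernel (x fastest in memory) into a single column,
// optionally appending a bias value at the bottom, as the loop above does.
std::vector<float> reshape_one_kernel(const std::vector<float> &weights,
                                      std::size_t W, std::size_t H, std::size_t D,
                                      const float *bias /* nullptr if absent */)
{
    std::vector<float> column;
    column.reserve(W * H * D + (bias != nullptr ? 1 : 0));

    for (std::size_t d = 0; d < D; ++d)         // depth (outermost, like kernel_depth)
    {
        for (std::size_t y = 0; y < H; ++y)     // rows (kernel_size_y)
        {
            for (std::size_t x = 0; x < W; ++x) // columns (kernel_size_x, innermost)
            {
                column.push_back(weights[(d * H + y) * W + x]);
            }
        }
    }
    if (bias != nullptr)
    {
        column.push_back(*bias); // bias ends up in the extra bottom row of dst
    }
    return column;
}

Keeping the bias in the same column lets a following GEMM treat it as one more row of the reshaped weights matrix.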
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 818d878119..52e3f2549c 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -28,8 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
- : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
{
}
@@ -49,24 +51,20 @@ void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const W
const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes;
const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes;
const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes;
- const auto input_nhwc_ptr = reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
- auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes());
+ const auto input_nhwc_ptr =
+ reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
+ auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() +
+ winograd_input_transform->info()->offset_first_element_in_bytes());
- _winograd_impl.input_transform->execute(
- _conv_args,
- input_nhwc_ptr,
- input_batch_stride,
- input_row_stride,
- input_col_stride,
- win_transf_ptr,
- _winograd_impl.winograd_spec,
- workspace->buffer(),
- info.thread_id,
- _nthreads);
+ _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride,
+ input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec,
+ workspace->buffer(), info.thread_id, _nthreads);
}
-CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
- : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
{
}
@@ -88,28 +86,21 @@ void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const
const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes;
const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes;
const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes;
- const auto wout_transf_ptr = reinterpret_cast<const void *>(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
- auto dst_nhwc_ptr = reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
- void *biases_data_ptr = nullptr;
- if(biases != nullptr)
+ const auto wout_transf_ptr = reinterpret_cast<const void *>(
+ winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
+ auto dst_nhwc_ptr =
+ reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
+ void *biases_data_ptr = nullptr;
+ if (biases != nullptr)
{
biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
// Output transform
- _winograd_impl.output_transform->execute(
- _conv_args,
- wout_transf_ptr,
- _winograd_impl.winograd_spec,
- biases_data_ptr,
- dst_nhwc_ptr,
- out_batch_stride,
- out_row_stride,
- out_col_stride,
- workspace->buffer(),
- info.thread_id,
- _nthreads);
+ _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr,
+ dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride,
+ workspace->buffer(), info.thread_id, _nthreads);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
index 0170dcae22..8a3b745e85 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.h
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -30,6 +30,7 @@
#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/cpu/ICpuKernel.h"
@@ -53,7 +54,9 @@ public:
/** Prevent instances of this class from being moved as it contains references.*/
CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete;
- CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+ CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +86,9 @@ public:
/** Prevent instances of this class from being moved as it contains references.*/
CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete;
- CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+ CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -95,7 +100,7 @@ public:
private:
arm_conv::winograd::WinogradImpl &_winograd_impl;
- const arm_conv::ConvolutionArgs &_conv_args;
+ const arm_conv::ConvolutionArgs &_conv_args;
uint32_t _nthreads;
};
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
index e51b5b3423..ddc6dc24cd 100644
--- a/src/cpu/kernels/activation/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -31,7 +31,7 @@ namespace cpu
{
namespace
{
-constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8};
} // namespace
void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
@@ -40,4 +40,4 @@ void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLaye
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
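The activation kernels above share one templated implementation, fp_neon_activation_impl<T, P>, where P is a constexpr parameter struct holding the SIMD step (lanes per iteration) and a tiny delta used on non-aarch64 builds to keep zeros out of SQRT. A minimal sketch of that pattern, with illustrative names and a scalar RELU stand-in (not ACL's code), assuming C++17:

#include <cstdio>

struct ActParams
{
    float delta;  // small epsilon, e.g. added before SQRT on targets without vsqrt
    int   step_x; // elements processed per iteration of the main loop
};

// Per-type constants, analogous to Fp32Params = {1e-24, 4} and Fp16Params = {1e-7, 8}.
inline constexpr ActParams Fp32LikeParams = {1e-24f, 4};
inline constexpr ActParams Fp16LikeParams = {1e-7f, 8};

template <typename T, const ActParams &P>
void activation_stub(const T *src, T *dst, int n)
{
    int x = 0;
    // Main loop: advance by the type's step; a real kernel would use NEON vectors here.
    for (; x <= n - P.step_x; x += P.step_x)
    {
        for (int i = 0; i < P.step_x; ++i)
        {
            dst[x + i] = src[x + i] > T(0) ? src[x + i] : T(0); // RELU as a placeholder
        }
    }
    // Left-over elements, mirroring the tail loops in the kernels above.
    for (; x < n; ++x)
    {
        dst[x] = src[x] > T(0) ? src[x] : T(0);
    }
}

int main()
{
    float in[10] = {-1.f, 2.f, -3.f, 4.f, -5.f, 6.f, -7.f, 8.f, -9.f, 10.f};
    float out[10];
    activation_stub<float, Fp32LikeParams>(in, out, 10);
    std::printf("%g %g %g\n", out[0], out[1], out[2]); // 0 2 0
    return 0;
}

Passing the struct by const reference as a non-type template parameter makes step_x a compile-time constant in each instantiation, so the compiler can fully unroll the inner loop.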
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
index 2a3b8a0bfd..e558f8c73e 100644
--- a/src/cpu/kernels/activation/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -29,7 +29,7 @@ namespace cpu
{
namespace
{
-constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};
} // namespace
void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
index 05885d8476..afeb6f7f3d 100644
--- a/src/cpu/kernels/activation/generic/neon/impl.h
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -56,10 +57,14 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma
#endif /* __aarch64__ */
template <typename T, const ActFpImplParams &P>
-void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void fp_neon_activation_impl(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ using ExactTagType =
+ typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
constexpr int window_step_x = P.step_x;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -72,12 +77,12 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
// to prevent NAN values caused by zeros in inputs to SQRT.
// In case of aarch64, we call vsqrt directly, so we don't use delta.
#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
#else /* #ifndef __aarch64__ */
- const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType {});
+ const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
@@ -88,143 +93,154 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
const auto a = static_cast<T>(act_info.a());
const auto b = static_cast<T>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = wrapper::vabs(vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = wrapper::vmla(vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = wrapper::vmax(const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
+ wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+ wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
+ tmp = wrapper::vsqrt(vin);
#else /* __aarch64__ */
{
const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
+ tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+ tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
}
#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
- break;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = wrapper::vmul(vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_6,
+ wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
+ const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
+ break;
#ifdef __aarch64__
- case ActivationLayerInfo::ActivationFunction::GELU:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
- break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+ break;
#endif /* __aarch64__ */
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T in = *(reinterpret_cast<const T *>(input_ptr + x));
- T tmp;
- switch(act)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<T>(static_cast<T>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<T>(a, std::max<T>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = in / (static_cast<T>(1) + std::exp(-a * in));
- break;
- case ActivationLayerInfo::ActivationFunction::GELU:
- tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = std::abs(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = a * in + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = std::max<T>(static_cast<T>(0), in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max<T>(b, in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = (in > 0) ? in : a * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = std::sqrt(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = in * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = a * std::tanh(b * in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = in / (static_cast<T>(1) + std::exp(-a * in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
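
For readers following the reformatted fp_neon_activation_impl above: the kernel's shape is a vectorized main loop that advances by P.step_x, followed by a scalar tail for the leftover elements. The plain C++ sketch below models only that structure, with RELU standing in for the full activation switch; it is not part of this patch, and activation_like_loop is an illustrative name rather than an ACL symbol.

#include <algorithm>
#include <vector>

// Sketch of the loop structure used by fp_neon_activation_impl after the
// reformat: a main loop that processes `step` elements per iteration
// (4 floats for the FP32 NEON path, where ACL uses wrapper::vloadq/vstore)
// plus a scalar tail that mirrors the per-element switch in the kernel.
void activation_like_loop(const float *src, float *dst, int len)
{
    constexpr int step = 4; // P.step_x for Fp32Params in the real kernel
    int x = 0;
    // Main loop: `step` elements per iteration (vectorized in ACL).
    for (; x <= len - step; x += step)
    {
        for (int i = 0; i < step; ++i)
        {
            dst[x + i] = std::max(0.0f, src[x + i]); // RELU case
        }
    }
    // Leftover elements are handled one at a time.
    for (; x < len; ++x)
    {
        dst[x] = std::max(0.0f, src[x]);
    }
}

int main()
{
    std::vector<float> in = {-2.f, -1.f, 0.f, 1.f, 2.f, 3.f, -4.f};
    std::vector<float> out(in.size());
    activation_like_loop(in.data(), out.data(), static_cast<int>(in.size()));
    return 0;
}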
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
index c973e964e4..f289c80d4b 100644
--- a/src/cpu/kernels/activation/generic/neon/lut.cpp
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -33,19 +34,22 @@ namespace cpu
#ifdef __aarch64__
void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+ src->info()->data_type() != DataType::QASYMM8_SIGNED);
const auto window_end_x = window.x().end();
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
- },
- input, output);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
}
#endif // __aarch64__
} // namespace cpu
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index e7c146e46f..1451301ea2 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -85,206 +89,222 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+ wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_u8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_u8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
- const uint32x4x4_t pos_mask =
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
{
- {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
+ }};
- const float32x4x4_t tmp_dep =
- {
- {
+ const float32x4x4_t tmp_dep = {{
wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
+ }};
- tmp = vquantize(tmp_dep, qi_out);
- }
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#else // #ifndef __aarch64__
- else if (act == ActivationLayerInfo::ActivationFunction::GELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(vin_deq.val[0],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[0], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[1],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[1], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[2],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[2], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[3],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[3], const_inv_sqrt_2))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::GELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
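
The QASYMM8 kernel above handles functions such as TANH by de-quantizing to float, applying the activation, and re-quantizing with the output scale and offset. The standalone sketch below illustrates that round trip only; QuantInfo, dequantize_u8 and quantize_u8 are simplified stand-ins for ACL's UniformQuantizationInfo and dequantize_qasymm8/quantize_qasymm8, not the library's actual helpers.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Simplified asymmetric quantization parameters: real = (q - offset) * scale.
struct QuantInfo
{
    float   scale;
    int32_t offset;
};

static float dequantize_u8(uint8_t v, const QuantInfo &qi)
{
    return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
}

static uint8_t quantize_u8(float v, const QuantInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// De-quantize, apply the float-domain activation (a * tanh(b * x) here),
// then re-quantize into the output space.
uint8_t tanh_qasymm8(uint8_t in, const QuantInfo &qi_in, const QuantInfo &qi_out, float a, float b)
{
    const float x = dequantize_u8(in, qi_in);
    const float y = a * std::tanh(b * x);
    return quantize_u8(y, qi_out);
}

int main()
{
    const QuantInfo qi_in{0.1f, 128};
    const QuantInfo qi_out{1.0f / 128.0f, 128};
    return tanh_qasymm8(200, qi_in, qi_out, 1.0f, 1.0f) > 0 ? 0 : 1;
}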
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 52c396459b..a2f588245a 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -76,191 +80,195 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+ wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_s8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_s8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgtz(vin_deq.val[0]),
wrapper::vcgtz(vin_deq.val[1]),
wrapper::vcgtz(vin_deq.val[2]),
wrapper::vcgtz(vin_deq.val[3]),
- }
- };
+ }};
#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
+ }};
#endif // __aarch64__
- const float32x4x4_t tmp_dep =
- {
- {
+ const float32x4x4_t tmp_dep = {{
wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
+ }};
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 2aea6cba3c..891646ea00 100644
--- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -21,11 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 8;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -59,103 +63,94 @@ void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL
const float a_f32 = act_info.a();
const float b_f32 = act_info.b();
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
- ARM_COMPUTE_UNUSED(tmp);
+ wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
+ ARM_COMPUTE_UNUSED(tmp);
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
- {
- wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
- wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
+ wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
- qsymm16_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
+ qsymm16_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 4757c60d8f..97399e01e0 100644
--- a/src/cpu/kernels/activation/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -29,11 +29,11 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
@@ -59,77 +59,87 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- svfloat16_t tmp;
+ svfloat16_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_f16(pg, input_ptr + x);
- switch(act)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f16_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f16_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f16_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f16(pg, output_ptr + x, tmp);
+ const auto vin = svld1_f16(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f16_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f16_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
+ svmax_f16_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
+ svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f16_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svmul_f16_z(
+ pg, const_inv_6,
+ svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svinv_f16_z(pg, svadd_f16_z(pg, const_1,
+ svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f16(pg, output_ptr + x, tmp);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index 87f04c255a..d1b075d52c 100644
--- a/src/cpu/kernels/activation/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
#include <cmath>
#include <cstddef>
-#include <arm_sve.h>
-
namespace arm_compute
{
namespace cpu
@@ -58,78 +58,89 @@ void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayer
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- svfloat32_t tmp;
+ svfloat32_t tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- const auto vin = svld1_f32(pg, input_ptr + x);
- switch(act)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f32_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f32_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f32_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f32(pg, output_ptr + x, tmp);
+ const auto vin = svld1_f32(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f32_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f32_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va),
+ svmax_f32_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin,
+ svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin,
+ svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f32_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svmul_f32_z(
+ pg, const_inv_6,
+ svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svinv_f32_z(pg, svadd_f32_z(pg, const_1,
+ svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f32(pg, output_ptr + x, tmp);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
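The hunk above only re-indents the SVE fp32 activation loop; the pattern underneath is a predicate-driven do/while that processes svcntw() lanes per pass and lets the final predicate mask the tail. A minimal sketch of that pattern, reduced to plain RELU and using only intrinsics that already appear in the hunk (the relu_f32_sve helper name is illustrative, not from the library; assumes an SVE-enabled toolchain and <arm_sve.h>):

#include <arm_sve.h>

void relu_f32_sve(const float *in, float *out, int n)
{
    const svfloat32_t zero = svdup_n_f32(0.f);
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // active lanes are x, x+1, ... while < n
    do
    {
        const svfloat32_t vin = svld1_f32(pg, in + x);
        svst1_f32(pg, out + x, svmax_f32_z(pg, zero, vin)); // RELU: max(0, x)
        x += svcntw();                // advance by the vector length in 32-bit lanes
        pg = svwhilelt_b32(x, n);     // the last iteration runs with a partial predicate
    } while (svptest_any(svptrue_b32(), pg));
}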
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
index d65de8d649..2ed667debf 100644
--- a/src/cpu/kernels/activation/generic/sve2/lut.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -33,19 +34,22 @@ namespace cpu
#ifdef __aarch64__
void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+ src->info()->data_type() != DataType::QASYMM8_SIGNED);
const auto window_end_x = window.x().end();
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = input.ptr();
- auto output_ptr = output.ptr();
- lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
- },
- input, output);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = input.ptr();
+ auto output_ptr = output.ptr();
+ lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
}
#endif // __aarch64__
} // namespace cpu
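The LUT kernel above handles every QASYMM8/QASYMM8_SIGNED activation the same way: when the operator is configured, all 256 possible input bytes are pre-mapped to output bytes, so the inner loop is a pure table lookup (done with SVE2 gathers inside lut_u8_sve2). A scalar sketch of the equivalent per-element computation; the helper name and the std::array table type are illustrative assumptions, not the library's types:

#include <array>
#include <cstddef>
#include <cstdint>

void apply_u8_activation_lut(const std::array<uint8_t, 256> &table,
                             const uint8_t *in, uint8_t *out, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        // The table was filled once with requantize(activation(dequantize(v))) for v = 0..255,
        // so the whole activation collapses to a single byte lookup per element.
        out[i] = table[in[i]];
    }
}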
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index bc9bc7aa3c..7efa9e4b72 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -26,18 +26,21 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- svuint8_t tmp;
+ svuint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_u8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_u8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
+ const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
}
else
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ ARM_COMPUTE_ERROR("Unsupported activation function");
}
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_u8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ svst1_u8(pg, output_ptr + x, tmp);
- }
- while(svptest_any(svptrue_b8(), pg));
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
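For LEAKY_RELU the kernel above never dequantizes: the input/output scale ratio (times alpha on the negative side) is stored as a fixed-point integer with 8 fractional bits, so each element becomes a widened multiply-add, an arithmetic shift right by 8, and a saturating narrow. A per-element scalar sketch of that arithmetic (the function name and the use of std::lround/std::clamp are illustrative; the kernel rounds to nearest-even and saturates through SVE2 narrowing instructions):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t leaky_relu_qasymm8_ref(uint8_t in, float alpha,
                               float in_scale, int32_t in_offset,
                               float out_scale, int32_t out_offset)
{
    const float s = in_scale / out_scale;
    // "Negative" in the real domain means below the input zero-point (for a positive scale).
    const bool  is_negative = (in_scale >= 0.f) ? (in < in_offset) : (in > in_offset);
    const float scale       = is_negative ? s * alpha : s;
    const float offset      = -in_offset * scale + out_offset;

    // Scale/offset stored with 8 fractional bits, as in s_leaky_s32 / o_leaky_s32 above.
    const int32_t s_q8 = static_cast<int32_t>(std::lround(scale * (1 << 8)));
    const int32_t o_q8 = static_cast<int32_t>(std::lround(offset * (1 << 8)));

    const int32_t acc = (static_cast<int32_t>(in) * s_q8 + o_q8) >> 8; // assumes arithmetic shift
    return static_cast<uint8_t>(std::clamp(acc, 0, 255));              // saturate to u8
}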
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index d20684f54d..e4667522dd 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -24,20 +24,23 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- svint8_t tmp;
+ svint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_s8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_s8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svmovlb_s32(svmovlb_s16(vin)),
- svmovlt_s32(svmovlb_s16(vin)),
- svmovlb_s32(svmovlt_s16(vin)),
- svmovlt_s32(svmovlt_s16(vin)));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+ const_3_f32))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
}
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+ // Expand to int32
+ const svint32x4_t vin_s32 =
+ svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+ svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
- svst1_s8(pg, output_ptr + x, tmp);
+ svst1_s8(pg, output_ptr + x, tmp);
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
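The HARD_SWISH branch above is the float fallback path: each int8 lane is de-quantized, pushed through x * clamp(x + 3, 0, 6) / 6, and re-quantized with the output scale and offset. A one-element scalar sketch of the same sequence (the reference function name and scalar rounding are illustrative; the kernel performs this on four f32 vectors per s8 vector):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t hard_swish_qasymm8_signed_ref(int8_t in,
                                     float in_scale, int32_t in_offset,
                                     float out_scale, int32_t out_offset)
{
    const float   x   = (static_cast<int32_t>(in) - in_offset) * in_scale;              // de-quantize
    const float   act = x * std::min(6.f, std::max(0.f, x + 3.f)) * (1.f / 6.f);        // hard swish
    const int32_t q   = static_cast<int32_t>(std::lround(act / out_scale)) + out_offset; // re-quantize
    return static_cast<int8_t>(std::clamp(q, -128, 127));                               // saturate to s8
}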
diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index 5154fac8a7..f955893307 100644
--- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -21,24 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/SVESymm.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -56,62 +59,70 @@ void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL
const auto va_f32 = svdup_n_f32(act_info.a());
const auto vb_f32 = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- svint16_t tmp;
+ svint16_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_s16(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
- svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
{
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
+ const auto vin = svld1_s16(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
+ svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
- svst1_s16(pg, output_ptr + x, tmp);
+ svst1_s16(pg, output_ptr + x, tmp);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
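QSYMM16 is a symmetric format, so the kernel above only needs the two scales (no zero-points) and splits each s16 vector into two f32 vectors for the math. The per-element computation of its LOGISTIC branch, as a scalar sketch (the function name is illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t logistic_qsymm16_ref(int16_t in, float in_scale, float out_scale)
{
    const float x   = in * in_scale;              // symmetric quantization: no offset term
    const float act = 1.f / (1.f + std::exp(-x)); // logistic
    const long  q   = std::lround(act / out_scale);
    return static_cast<int16_t>(std::clamp<long>(q, INT16_MIN, INT16_MAX)); // saturate to s16
}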
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index fca7b2cd9f..e7679c14e3 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -30,10 +30,11 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<float16_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
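This fp16 wrapper, like the fp32 one in the next file, changes only in formatting; both functions simply forward to the shared add_same_neon<T> template. A scalar sketch of that forwarding shape (add_same_ref and its flat-buffer signature are illustrative stand-ins, not the library's windowed NEON implementation):

#include <cstddef>

// Illustrative stand-in for add_same_neon<T>: element-wise add over a flat buffer.
template <typename T>
void add_same_ref(const T *src0, const T *src1, T *dst, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        dst[i] = src0[i] + src1[i]; // ConvertPolicy::SATURATE handling omitted in this sketch
    }
}

// Each per-type entry point then reduces to a one-line forward:
inline void add_fp32_ref(const float *a, const float *b, float *dst, size_t n)
{
    return add_same_ref<float>(a, b, dst, n);
}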
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 1f599b1968..11a970bef4 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -28,9 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<float>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
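The impl.cpp hunks below reformat the Q8 fixed-point add path: the input scales are pre-converted to signed 5.11 fixed point (multiplied by the _2pow11 = 2048 constant in the hunks), the accumulator is kept in 21.11 format, and the result is brought back to 8 bits by a rounding shift of 3 (to 8.8) followed by a rounding shift of 8, saturating at each narrowing step. A per-element scalar sketch of that arithmetic (the function name, the explicit rounding adds, and the final std::clamp are illustrative assumptions; an arithmetic right shift of negative values is assumed):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t add_q8_fixedpoint_ref(int8_t in0, int8_t in1,
                             float scale0, float scale1, float offset)
{
    // Scales as signed 5.11 fixed point, offset as 21.11 (both use 2^11 = 2048).
    const int16_t s0_5p11      = static_cast<int16_t>(std::lround(scale0 * 2048.f));
    const int16_t s1_5p11      = static_cast<int16_t>(std::lround(scale1 * 2048.f));
    const int32_t offset_21p11 = static_cast<int32_t>(std::lround(offset * 2048.f));

    // Widen and multiply-accumulate: the sum stays within a 21.11 signed accumulator.
    const int32_t acc_21p11 = int32_t(in0) * s0_5p11 + int32_t(in1) * s1_5p11 + offset_21p11;

    // Rounding shift by 3 (-> 8.8) with saturation to 16 bits, then rounding shift by 8 (-> 8.0).
    const int32_t acc_8p8 = std::clamp((acc_21p11 + 4) >> 3, -32768, 32767);
    const int32_t acc_8p0 = (acc_8p8 + 128) >> 8;
    return static_cast<int8_t>(std::clamp(acc_8p0, -128, 127));
}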
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index 2dde13544a..34938cc4c4 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -23,8 +23,10 @@
*/
#include "src/cpu/kernels/add/generic/neon/impl.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -40,7 +42,10 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true);
}
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition)
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition)
{
const auto iq0 = src0->quantization_info().uniform();
const auto iq1 = src1->quantization_info().uniform();
@@ -49,7 +54,7 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
const auto scale0 = iq0.scale / oq.scale;
const auto scale1 = iq1.scale / oq.scale;
- if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
+ if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
{
// The scale factor cannot be stored as 5.11 signed fixed-point number.
return false;
@@ -57,9 +62,10 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
- const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
+ const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset))
+ : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
- if(max_acc > 1048575.f) // 2^20 - 1
+ if (max_acc > 1048575.f) // 2^20 - 1
{
// It might not be possible to store the result as 21.11 signed fixed-point number.
return false;
@@ -69,13 +75,19 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
}
template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/);
}
template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -103,7 +115,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
const auto oq_info = dst->info()->quantization_info().uniform();
const auto in0_scale = iq0_info.scale / oq_info.scale;
const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale));
- const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
+ const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
constexpr float _2pow11 = 2048;
const auto in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11));
@@ -112,7 +124,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
constexpr uint8_t shift_amount_remainder = 3;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Prefix: a = non-broadcast, b = broadcast.
@@ -138,68 +150,75 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
- const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- const auto b_val = *b_ptr;
- const auto b_scaled = b_scale * b_val;
- const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
- const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
- const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto b_val = *b_ptr;
+ const auto b_scaled = b_scale * b_val;
+ const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
+ const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
+ const auto b_vscaled_offseted_21p11 =
+ wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
#ifndef __aarch64__
- const auto b_scaled_offseted = b_scaled + offset;
+ const auto b_scaled_offseted = b_scaled + offset;
#endif // __aarch64__
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the input.
- const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
-
- // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
- const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
-
- // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
- // Widen and store the result in 32-bit integer.
- const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
- const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
- const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
- const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
-
- // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
- const auto vout_8p8_0 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
- const auto vout_8p8_1 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
-
- // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<8>(vout_8p8_0),
- wrapper::vqrshrn<8>(vout_8p8_1));
-
- // Store the result.
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
-
- // Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the input.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vout_21p11_00 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
#else // __aarch64__
- out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
#endif // __aarch64__
- }
- },
- b_input_it, a_input_it, out_it);
+ }
+ },
+ b_input_it, a_input_it, out_it);
}
else
{
@@ -216,70 +235,85 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
- const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
- const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
-
- // Widen the input elements to signed 16-bit regardless of the input signedness.
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
- const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
- const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
- const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
-
- // Multiply the input elements by the scale factor and add the offset.
- // Widen and store the result in 32-bit integer.
- const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
- const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
- const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
- const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
-
- const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
- const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
- const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
- const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
-
- // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
- const auto vout_8p8_0 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
- const auto vout_8p8_1 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
-
- // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<8>(vout_8p8_0),
- wrapper::vqrshrn<8>(vout_8p8_1));
-
- // Store the result.
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
-
- // Process the left-over elements.
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ // Multiply the input elements by the scale factor and add the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vscaled0_offseted_21p11_00 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_01 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_10 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_11 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
+
+ const auto vout_21p11_00 =
+ wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
#else // __aarch64__
- out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
#endif // __aarch64__
- }
- },
- in0_it, in1_it, out_it);
+ }
+ },
+ in0_it, in1_it, out_it);
}
}
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -304,7 +338,7 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -324,63 +358,64 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
- const auto output_ptr = output.ptr();
-
- const auto broadcast_value = *broadcast_input.ptr();
- const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
- const auto bfs = float(broadcast_value) * bf_scale + offset;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
+ const auto output_ptr = output.ptr();
+
+ const auto broadcast_value = *broadcast_input.ptr();
+ const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
- const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
- const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
- const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
- const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
- const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
- const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(af_0);
- rf_1 = vcvtnq_s32_f32(af_1);
- rf_2 = vcvtnq_s32_f32(af_2);
- rf_3 = vcvtnq_s32_f32(af_3);
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(af_0);
- rf_1 = vcvtq_s32_f32(af_1);
- rf_2 = vcvtq_s32_f32(af_2);
- rf_3 = vcvtq_s32_f32(af_3);
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
#endif //__aarch64__
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- broadcast_input, non_broadcast_input, output);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -397,72 +432,78 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
const auto voffset = vdupq_n_f32(offset);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = input1.ptr();
- const auto input2_ptr = input2.ptr();
- const auto output_ptr = output.ptr();
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t a = vld1q_u8(input1_ptr + x);
- const uint8x16_t b = vld1q_u8(input2_ptr + x);
-
- const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
- const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
- const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
- const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
-
- const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
- const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
- const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
- const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
-
- const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
- const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
- const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
- const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto input1_ptr = input1.ptr();
+ const auto input2_ptr = input2.ptr();
+ const auto output_ptr = output.ptr();
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(input1_ptr + x);
+ const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+ const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
+ const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(bf_0);
- rf_1 = vcvtnq_s32_f32(bf_1);
- rf_2 = vcvtnq_s32_f32(bf_2);
- rf_3 = vcvtnq_s32_f32(bf_3);
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(bf_0);
- rf_1 = vcvtq_s32_f32(bf_1);
- rf_2 = vcvtq_s32_f32(bf_2);
- rf_3 = vcvtq_s32_f32(bf_3);
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
#endif //__aarch64__
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- input1, input2, output);
+ }
+ },
+ input1, input2, output);
}
}
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -487,7 +528,7 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -507,63 +548,64 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
- const auto bfs = float(broadcast_value) * bf_scale + offset;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
- const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+ const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
- const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
- const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
- const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
- const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(af_0);
- rf_1 = vcvtnq_s32_f32(af_1);
- rf_2 = vcvtnq_s32_f32(af_2);
- rf_3 = vcvtnq_s32_f32(af_3);
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(af_0);
- rf_1 = vcvtq_s32_f32(af_1);
- rf_2 = vcvtq_s32_f32(af_2);
- rf_3 = vcvtq_s32_f32(af_3);
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
#endif //__aarch64__
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- broadcast_input, non_broadcast_input, output);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -580,79 +622,102 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
const auto voffset = vdupq_n_f32(offset);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t a = vld1q_s8(input1_ptr + x);
- const int8x16_t b = vld1q_s8(input2_ptr + x);
-
- const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
- const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
- const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
- const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
-
- const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
- const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
- const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
- const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
-
- const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
- const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
- const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
- const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(input1_ptr + x);
+ const int8x16_t b = vld1q_s8(input2_ptr + x);
+
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+ const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
+ const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(bf_0);
- rf_1 = vcvtnq_s32_f32(bf_1);
- rf_2 = vcvtnq_s32_f32(bf_2);
- rf_3 = vcvtnq_s32_f32(bf_3);
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(bf_0);
- rf_1 = vcvtq_s32_f32(bf_1);
- rf_2 = vcvtq_s32_f32(bf_2);
- rf_3 = vcvtq_s32_f32(bf_3);
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
#endif //__aarch64__
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- input1, input2, output);
+ }
+ },
+ input1, input2, output);
}
}
-template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-
-template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+template void add_q8_neon_fixedpoint<int8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_q8_neon_fixedpoint<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
} // namespace cpu
} // namespace arm_compute
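
For reference, the fixed-point path near the top of this file folds the two input scales into Q5.11 factors and the combined offset into a Q21.11 accumulator, then narrows back to 8 bits with two rounding shifts (vqrshrn_ex followed by vqrshrn<8>). The scalar model below is only an illustrative sketch of that arithmetic; the helper name and the explicit shift parameter are assumptions, not part of the patch.

    // Hypothetical scalar model of the Q8 fixed-point path: inputs scaled by Q5.11
    // factors, accumulated with a Q21.11 offset in int32, then rounded and narrowed
    // back to 8 bits in two saturating steps.
    #include <algorithm>
    #include <cstdint>

    int8_t add_q8_fixedpoint_scalar(int8_t a, int8_t b,
                                    int16_t scale_a_5p11, int16_t scale_b_5p11,
                                    int32_t offset_21p11, int shift_remainder)
    {
        // Accumulate in Q21.11: int8 * Q5.11 stays well inside int32.
        const int32_t acc_21p11 = int32_t(a) * scale_a_5p11 + int32_t(b) * scale_b_5p11 + offset_21p11;
        // First rounding shift mirrors vqrshrn_ex<shift_remainder>: keep 8 fractional bits.
        const int32_t rounded_8p8 = (acc_21p11 + (1 << (shift_remainder - 1))) >> shift_remainder;
        const int32_t sat_8p8     = std::clamp(rounded_8p8, -32768, 32767);
        // Second rounding shift mirrors vqrshrn<8>: drop the fraction and saturate to int8.
        const int32_t rounded_8p0 = (sat_8p8 + (1 << 7)) >> 8;
        return static_cast<int8_t>(std::clamp(rounded_8p0, -128, 127));
    }
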
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index fb786c5bc1..faa99baffe 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -35,7 +36,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
@@ -53,7 +55,7 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -69,31 +71,36 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
- }
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto res = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::add_sat(broadcast_value, non_broadcast_v)
+ : broadcast_value + non_broadcast_v;
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -106,31 +113,34 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+ }
+ },
+ input1, input2, output);
}
}
@@ -138,17 +148,36 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition);
-
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
} // namespace cpu
} // namespace arm_compute
#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
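
The header above only declares the kernels; the policy split they all share (wrapper::vadd for WRAP versus wrapper::vqadd for SATURATE, mirrored by add_sat in the scalar tails) reduces to the small scalar model below. This is a sketch under that assumption, not library code, and the function name is illustrative.

    // Minimal scalar model of ConvertPolicy handling: WRAP adds with ordinary
    // integer wrap-around, SATURATE clamps to the representable range of T.
    #include <cstdint>
    #include <limits>

    template <typename T>
    T add_with_policy(T a, T b, bool saturate)
    {
        if (!saturate)
            return static_cast<T>(a + b); // WRAP: plain modular addition
        const int64_t wide = int64_t(a) + int64_t(b);
        const int64_t lo   = std::numeric_limits<T>::min();
        const int64_t hi   = std::numeric_limits<T>::max();
        return static_cast<T>(wide < lo ? lo : (wide > hi ? hi : wide));
    }
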
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index 5698d6d552..f0bcebc9d2 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -28,19 +28,22 @@ namespace arm_compute
{
namespace cpu
{
-void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
}
-void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<int16_t>(src0, src1, dst, policy, window);
}
-void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<int32_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
index 69cca956c8..8195d229d9 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
@@ -23,15 +23,17 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
index dfdf8fe85b..7e23096239 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
@@ -23,15 +23,17 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
index e76e408d6e..ac2de0557a 100644
--- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
{
namespace cpu
{
-void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +59,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,48 +76,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#endif //__aarch64__
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -127,48 +131,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#endif //__aarch64__
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
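
As a scalar cross-check of the QSYMM16 kernel above (symmetric quantization, so there are no zero-point terms), the per-element math reduces to the sketch below; the function name is illustrative only.

    // Rough scalar equivalent of the QSYMM16 path: dequantize both symmetric
    // inputs, add in float, then requantize with the output scale and saturate
    // to int16, matching the vqmovn_s32 narrowing in the vector loop.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int16_t add_qsymm16_scalar(int16_t a, int16_t b, float scale_a, float scale_b, float scale_out)
    {
        const float sum = float(a) * scale_a + float(b) * scale_b;       // dequantize and add
        const long  q   = std::lround(sum / scale_out);                  // round to nearest output step
        return static_cast<int16_t>(std::clamp<long>(q, -32768, 32767)); // saturate
    }
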
diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp
index 581f3abded..01dfe6c44b 100644
--- a/src/cpu/kernels/add/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp16.cpp
@@ -31,10 +31,11 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<float16_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp
index b37799113a..56771a5411 100644
--- a/src/cpu/kernels/add/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp32.cpp
@@ -24,15 +24,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/kernels/add/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<float>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp
index e8606436fd..ca850fcef4 100644
--- a/src/cpu/kernels/add/generic/sve/impl.cpp
+++ b/src/cpu/kernels/add/generic/sve/impl.cpp
@@ -23,17 +23,21 @@
*/
#include "src/cpu/kernels/add/generic/sve/impl.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
const auto all_true_pg = wrapper::svptrue<ScalarType>();
const auto window_start_x = static_cast<int>(window.x().start());
@@ -53,7 +57,7 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
Iterator output(dst, window);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -68,28 +72,30 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v)
+ : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -101,35 +107,41 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = svld1(pg, input1_ptr + x);
- const auto val2 = svld1(pg, input2_ptr + x);
- const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto val1 = svld1(pg, input1_ptr + x);
+ const auto val2 = svld1(pg, input2_ptr + x);
+ const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
} // namespace cpu
} // namespace arm_compute
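
The reason the SVE variants need no scalar leftover loop is the predicated do/while pattern reformatted above: svwhilelt keeps only the in-range lanes active and svptest_any terminates once the predicate is empty. A minimal sketch, assuming an SVE-enabled toolchain and plain float addition rather than the library wrappers:

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    // Predicated SVE loop: the final partial vector is handled by the predicate,
    // so no separate tail loop is required.
    void add_f32_sve_sketch(const float *a, const float *b, float *out, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32_s32(x, n);
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x);
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb)); // inactive lanes are not stored
            x += static_cast<int>(svcntw());                 // advance by the vector length in words
            pg = svwhilelt_b32_s32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }
    #endif
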
diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h
index 0136f14246..6a95d66826 100644
--- a/src/cpu/kernels/add/generic/sve/impl.h
+++ b/src/cpu/kernels/add/generic/sve/impl.h
@@ -33,7 +33,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp
index 3642dccd7b..4d17f2adbd 100644
--- a/src/cpu/kernels/add/generic/sve/integer.cpp
+++ b/src/cpu/kernels/add/generic/sve/integer.cpp
@@ -24,25 +24,29 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/kernels/add/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
}
-void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<int16_t>(src0, src1, dst, policy, window);
}
-void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<int32_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
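
The SVE2 QASYMM8 kernel that follows, like the NEON add_sub_qasymm8_neon path earlier in this patch, implements the same asymmetric requantization; a scalar model of that math, with an illustrative function name, is:

    // Scalar model of quantized-asymmetric addition: subtract the input offsets,
    // scale to float, add, then requantize against the output scale and offset.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b,
                               float scale_a, int32_t offset_a,
                               float scale_b, int32_t offset_b,
                               float scale_out, int32_t offset_out)
    {
        const float af = (int32_t(a) - offset_a) * scale_a;
        const float bf = (int32_t(b) - offset_b) * scale_b;
        const long  q  = std::lround((af + bf) / scale_out) + offset_out;
        return static_cast<uint8_t>(std::clamp<long>(q, 0, 255));
    }
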
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
index 1dec214aa0..40add9d51b 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -58,7 +61,7 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto voffseto = svdup_n_f32(oq_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,48 +81,89 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
- do
- {
- const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
-
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ do
+ {
+ const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -136,45 +180,82 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto voffset1 = svdup_n_s32(iq1_info.offset);
const auto voffset2 = svdup_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_u8(pg, input1_ptr + x);
- const auto b = svld1_u8(pg, input2_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
-
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_u8(pg, input1_ptr + x);
+ const auto b = svld1_u8(pg, input2_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
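The QASYMM8 addition above dequantises each operand as (q - offset) * scale, adds in float, then requantises as offset + sum / scale with a saturating narrow back to eight bits. A minimal scalar sketch of that arithmetic follows; the helper name and struct are assumptions of this note rather than library code, and the truncating convert only approximates the rounding of the vector path.

// Scalar reference for the arithmetic that add_qasymm8_sve2 vectorises.
// Illustrative only; names are assumed and rounding may differ slightly.
#include <algorithm>
#include <cstddef>
#include <cstdint>

struct UniformQParams
{
    float   scale;
    int32_t offset;
};

inline void add_qasymm8_reference(const uint8_t *a, const uint8_t *b, uint8_t *dst, size_t len,
                                  UniformQParams qa, UniformQParams qb, UniformQParams qo)
{
    for (size_t i = 0; i < len; ++i)
    {
        // Dequantise: (q - offset) * scale, mirroring the svsub/svcvt/svmul chain.
        const float fa = (static_cast<int32_t>(a[i]) - qa.offset) * qa.scale;
        const float fb = (static_cast<int32_t>(b[i]) - qb.offset) * qb.scale;
        // Requantise: offset + sum / scale, mirroring svmla(voffseto, sum, invvscaleo).
        const int32_t q = static_cast<int32_t>(qo.offset + (fa + fb) / qo.scale);
        // Saturate to the uint8_t range, as the svqxtn narrowing steps do.
        dst[i] = static_cast<uint8_t>(std::clamp(q, 0, 255));
    }
}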
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
index dae8899753..2e585115e1 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +60,7 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto voffseto = svdup_n_f32(oq_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,46 +81,63 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+
+ do
+ {
+ const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -134,46 +154,59 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
const auto voffset1 = svdup_n_s32(iq1_info.offset);
const auto voffset2 = svdup_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_s8(pg, input1_ptr + x);
- const auto b = svld1_s8(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_s8(pg, input1_ptr + x);
+ const auto b = svld1_s8(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
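Both QASYMM8 variants above, and the QSYMM16 kernel below, share one predicated loop structure: svwhilelt builds a predicate from the remaining element count, that predicate masks every load, arithmetic op and store, and svptest_any ends the loop once no lanes stay active, so no scalar tail loop is needed. A standalone sketch of that idiom, assuming an SVE-capable toolchain (it is not taken from the library):

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

// Minimal illustration of the whilelt/ptest_any loop used by the kernels above.
void copy_u8_sve(const uint8_t *src, uint8_t *dst, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b8(x, len); // active lanes for the first chunk
    do
    {
        // The same predicate guards both the load and the store.
        svst1_u8(pg, dst + x, svld1_u8(pg, src + x));
        x += svcntb();                   // advance by the vector length in bytes
        pg = svwhilelt_b8(x, len);       // shrink the predicate for the tail
    } while (svptest_any(svptrue_b8(), pg));
}
#endif // __ARM_FEATURE_SVE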
diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
index 8c48ded942..17a42c2138 100644
--- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -59,7 +62,7 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto all_true_pg = svptrue_b16();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -74,39 +77,40 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
- do
- {
- const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+ do
+ {
+ const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
+ svst1_s16(pg, output_ptr + x, res);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -118,37 +122,38 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- auto a = svld1_s16(pg, input1_ptr + x);
- auto b = svld1_s16(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ auto a = svld1_s16(pg, input1_ptr + x);
+ auto b = svld1_s16(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
+
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ svst1_s16(pg, output_ptr + x, res);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
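QSYMM16 uses symmetric quantisation, so unlike the QASYMM8 kernels there are no zero-point offsets to subtract or add; the kernel above simply scales, adds and rescales with saturation. A scalar restatement under the same caveats as before (assumed names, truncating requantisation):

// Scalar reference for the arithmetic handled by add_qsymm16_sve2 above.
#include <algorithm>
#include <cstddef>
#include <cstdint>

inline void add_qsymm16_reference(const int16_t *a, const int16_t *b, int16_t *dst, size_t len,
                                  float scale_a, float scale_b, float scale_out)
{
    for (size_t i = 0; i < len; ++i)
    {
        const float   sum = a[i] * scale_a + b[i] * scale_b;          // dequantise and add
        const int32_t q   = static_cast<int32_t>(sum / scale_out);    // requantise (truncating)
        dst[i] = static_cast<int16_t>(std::clamp(q, -32768, 32767));  // saturate like svqxtn
    }
}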
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
index 7cdb70fd9e..1040c39a41 100644
--- a/src/cpu/kernels/add/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -31,8 +31,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_ADD_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
DECLARE_ADD_KERNEL(add_qasymm8_neon);
DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
@@ -55,4 +56,4 @@ DECLARE_ADD_KERNEL(add_qsymm16_sve2);
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_KERNELS_ADD_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_KERNELS_ADD_LIST_H
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
index d8e5f694a8..b4b81aa78b 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/CpuTypes.h"
#include <cstddef>
@@ -38,16 +39,20 @@ namespace
{
using arm_compute::float16_t;
-void a64_add_bn_clamp_direct_fp16_2x32(
- float16_t *out, size_t out_stride,
- float16_t *out_direct, size_t out_direct_stride,
- const float16_t *in0, size_t in0_stride,
- const float16_t *in1, size_t in1_stride,
- const float16_t *bn_mul,
- const float16_t *bn_add,
- const float16_t minval,
- const float16_t maxval,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out,
+ size_t out_stride,
+ float16_t *out_direct,
+ size_t out_direct_stride,
+ const float16_t *in0,
+ size_t in0_stride,
+ const float16_t *in1,
+ size_t in1_stride,
+ const float16_t *bn_mul,
+ const float16_t *bn_add,
+ const float16_t minval,
+ const float16_t maxval,
+ size_t width,
+ size_t height)
{
struct KernelArgs
{
@@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32(
"subs x20, x20, #0x2\n"
"bgt 8b\n"
"58:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -869,8 +879,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp16_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I
float16_t minval = std::numeric_limits<half>::lowest();
float16_t maxval = std::numeric_limits<half>::max();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = static_cast<float16_t>(0.f);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = static_cast<float16_t>(0.f);
maxval = static_cast<float16_t>(act_info.a());
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = static_cast<float16_t>(act_info.b());
maxval = static_cast<float16_t>(act_info.a());
@@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp16_2x32(
- reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float16_t *>(bn_mul->buffer()),
- reinterpret_cast<float16_t *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp16_2x32(
- reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float16_t *>(bn_mul->buffer()),
- reinterpret_cast<float16_t *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr,
+ out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()),
+ in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
index b0c487ec56..f0444b6acd 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
@@ -35,16 +35,20 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_fp32_2x16(
- float *out, size_t out_stride,
- float *out_direct, size_t out_direct_stride,
- const float *in0, size_t in0_stride,
- const float *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const float minval,
- const float maxval,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp32_2x16(float *out,
+ size_t out_stride,
+ float *out_direct,
+ size_t out_direct_stride,
+ const float *in0,
+ size_t in0_stride,
+ const float *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const float minval,
+ const float maxval,
+ size_t width,
+ size_t height)
{
struct KernelArgs
{
@@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16(
"subs x20, x20, #0x2\n"
"bgt 8b\n"
"34:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
-}
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
+} // namespace
namespace arm_compute
{
namespace cpu
{
-void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp32_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I
float minval = std::numeric_limits<float>::lowest();
float maxval = std::numeric_limits<float>::max();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = 0.f;
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = 0.f;
maxval = act_info.a();
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = act_info.b();
maxval = act_info.a();
@@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp32_2x16(
- reinterpret_cast<float *>(out_it.ptr()), out_stride,
- reinterpret_cast<float *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float *>(bn_mul->buffer()),
- reinterpret_cast<float *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp32_2x16(
- reinterpret_cast<float *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float *>(bn_mul->buffer()),
- reinterpret_cast<float *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()),
+ in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
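The addmuladd kernels in this and the surrounding files fuse an elementwise add with a batch-normalisation style multiply-add and a clamp: the raw sum goes to an optional direct output, and the scaled, shifted and clamped result goes to the final output. A scalar sketch of that contract, under the assumptions that strides are counted in elements and that bn_mul/bn_add are indexed along the width dimension; it is not the library implementation.

// Scalar sketch of what the fused a64_add_bn_clamp_direct_* micro-kernels compute for FP32.
#include <algorithm>
#include <cstddef>

inline void add_mul_add_reference(float *out, size_t out_stride,
                                  float *out_direct, size_t out_direct_stride, // may be nullptr
                                  const float *in0, size_t in0_stride,
                                  const float *in1, size_t in1_stride,
                                  const float *bn_mul, const float *bn_add,
                                  float minval, float maxval,
                                  size_t width, size_t height)
{
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            const float sum = in0[y * in0_stride + x] + in1[y * in1_stride + x];
            if (out_direct != nullptr)
            {
                out_direct[y * out_direct_stride + x] = sum; // optional un-activated add output
            }
            const float bn = sum * bn_mul[x] + bn_add[x];    // per-column scale and shift
            out[y * out_stride + x] = std::clamp(bn, minval, maxval);
        }
    }
}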
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
index f7448a6717..035805c944 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
@@ -36,22 +36,30 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_u8_fp32_2x16(
- uint8_t *out, size_t out_stride,
- uint8_t *out_direct, size_t out_direct_stride,
- const uint8_t *in0, size_t in0_stride,
- const uint8_t *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const uint8_t minval,
- const uint8_t maxval,
- int32_t out_zeropt, float out_scale,
- int32_t out_direct_zeropt, float out_direct_scale,
- int32_t in0_zeropt, float in0_scale,
- int32_t in1_zeropt, float in1_scale,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out,
+ size_t out_stride,
+ uint8_t *out_direct,
+ size_t out_direct_stride,
+ const uint8_t *in0,
+ size_t in0_stride,
+ const uint8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const uint8_t minval,
+ const uint8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
{
- float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
struct KernelArgs
{
const float *scales;
@@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16(
"subs x23, x23, #0x2\n"
"bgt 6b\n"
"32:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -720,8 +738,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_u8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe
uint8_t maxval = std::numeric_limits<uint8_t>::max();
const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = quantize_qasymm8(0.f, final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = quantize_qasymm8(0.f, final_output_qinfo);
maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
}
- const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
- const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
- const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
const int32_t in1_offset = in1_qinfo.offset;
const int32_t in2_offset = in2_qinfo.offset;
@@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_u8_fp32_2x16(
- reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_u8_fp32_2x16(
- reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
index 1ae2cb76a9..e1a45b467b 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
@@ -36,22 +36,30 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_s8_fp32_2x16(
- int8_t *out, size_t out_stride,
- int8_t *out_direct, size_t out_direct_stride,
- const int8_t *in0, size_t in0_stride,
- const int8_t *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const int8_t minval,
- const int8_t maxval,
- int32_t out_zeropt, float out_scale,
- int32_t out_direct_zeropt, float out_direct_scale,
- int32_t in0_zeropt, float in0_scale,
- int32_t in1_zeropt, float in1_scale,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out,
+ size_t out_stride,
+ int8_t *out_direct,
+ size_t out_direct_stride,
+ const int8_t *in0,
+ size_t in0_stride,
+ const int8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const int8_t minval,
+ const int8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
{
- float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
struct KernelArgs
{
const float *scales;
@@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16(
"subs x23, x23, #0x2\n"
"bgt 6b\n"
"32:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -720,8 +738,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_s8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe
int8_t maxval = std::numeric_limits<int8_t>::max();
const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
}
- const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
- const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
- const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
const int32_t in1_offset = in1_qinfo.offset;
const int32_t in2_offset = in2_qinfo.offset;
@@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_s8_fp32_2x16(
- reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval,
+ out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset,
+ in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_s8_fp32_2x16(
- reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h
index a7c22c06d8..568003a916 100644
--- a/src/cpu/kernels/addmuladd/list.h
+++ b/src/cpu/kernels/addmuladd/list.h
@@ -32,9 +32,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \
+#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \
void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+ ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \
+ const ActivationLayerInfo &act_info, const Window &window)
DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon);
DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon);
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 10bf8e4ff7..6e8f32ef47 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/NEON/INEKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
@@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel
public:
/** Constructor
*/
- CpuGemmAssemblyWrapperKernel()
- : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
+ CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
{
}
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
const char *name() const override
@@ -110,7 +110,7 @@ public:
INEKernel::configure(win);
- if(!kernel_name_tag.empty())
+ if (!kernel_name_tag.empty())
{
_name += "/" + kernel_name_tag;
}
@@ -132,7 +132,7 @@ public:
private:
arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
+ std::string _name;
};
} // namespace kernel
} // namespace cpu
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 4c127b4ec3..9a913c5c58 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -23,13 +23,12 @@
*/
#pragma once
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
#include <cstring>
#include <memory>
#include <vector>
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
namespace arm_gemm
{
enum class GemmMethod
@@ -111,8 +110,7 @@ struct GemmConfig
unsigned int outer_block_size = 0;
WeightFormat weight_format = WeightFormat::ANY;
- GemmConfig(GemmMethod method)
- : method(method)
+ GemmConfig(GemmMethod method) : method(method)
{
}
GemmConfig()
@@ -133,8 +131,7 @@ struct Activation
float param1;
float param2;
- Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
- : type(type), param1(p1), param2(p2)
+ Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2)
{
}
};
@@ -156,12 +153,32 @@ public:
bool _fast_mode;
const GemmConfig *_cfg;
- GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
- unsigned int K, unsigned int Ksections, unsigned int nbatches,
- unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads),
- _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg)
+ GemmArgs(const CPUInfo *ci,
+ unsigned int M,
+ unsigned int N,
+ unsigned int K,
+ unsigned int Ksections,
+ unsigned int nbatches,
+ unsigned int nmulti,
+ bool indirect_input,
+ Activation act,
+ const int maxthreads,
+ bool fixed_format = false,
+ bool fast_mode = false,
+ const GemmConfig *cfg = nullptr)
+ : _ci(ci),
+ _Msize(M),
+ _Nsize(N),
+ _Ksize(K),
+ _Ksections(Ksections),
+ _nbatches(nbatches),
+ _nmulti(nmulti),
+ _indirect_input(indirect_input),
+ _act(act),
+ _maxthreads(maxthreads),
+ _fixed_format(fixed_format),
+ _fast_mode(fast_mode),
+ _cfg(cfg)
{
}
};
@@ -187,23 +204,51 @@ public:
Requantize32() = default;
// Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
- per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
+ int32_t requant_shift,
+ int32_t requant_mul,
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(false),
+ per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+ per_layer_right_shift(std::min<int32_t>(requant_shift, 0)),
+ per_layer_mul(requant_mul),
+ minval(minv),
+ maxval(maxv)
{
}
// Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
const int32_t *requant_left_shifts,
const int32_t *requant_right_shifts,
const int32_t *requant_muls,
- int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
- per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(true),
+ per_channel_left_shifts(requant_left_shifts),
+ per_channel_right_shifts(requant_right_shifts),
+ per_channel_muls(requant_muls),
+ minval(minv),
+ maxval(maxv)
{
}
};
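For reference: the per-tensor Requantize32 constructor above splits the single signed requant_shift argument into a non-negative left shift and a non-positive right shift. A minimal standalone sketch of that split (illustration only, not part of the patch; the local names are invented):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t requant_shift = -3;                                  // negative => pure right shift
    const int32_t left_shift    = std::max<int32_t>(requant_shift, 0); // 0,  as in per_layer_left_shift
    const int32_t right_shift   = std::min<int32_t>(requant_shift, 0); // -3, as in per_layer_right_shift
    std::printf("left=%d right=%d\n", left_shift, right_shift);
    return 0;
}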
diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
index 718fcd1fb4..0672e899b6 100644
--- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/Window.h"
#include "ndrange.hpp"
-
#include <cassert>
/* This file contains mapping between integral types used in arm_compute and arm_gemm
@@ -38,8 +37,7 @@
namespace arm_gemm
{
//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions;
using ndrange_t = NDRange<ndrange_max>;
using ndcoord_t = NDCoordinate<ndrange_max>;
@@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
//populate the window with the dimensions of the NDRange
win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
@@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
const auto start = ndc.get_position(i);
const auto size = ndc.get_size(i);
@@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
*/
inline ndrange_t to_ndrange(const arm_compute::Window &win)
{
- return
- {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
+ return {static_cast<unsigned int>(win[0].end() - win[0].start()),
+ static_cast<unsigned int>(win[1].end() - win[1].start()),
+ static_cast<unsigned int>(win[2].end() - win[2].start()),
+ static_cast<unsigned int>(win[3].end() - win[3].start()),
+ static_cast<unsigned int>(win[4].end() - win[4].start()),
+ static_cast<unsigned int>(win[5].end() - win[5].start())};
}
/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
@@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win)
*/
inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
{
- return
- {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
+ return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())},
+ {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())},
+ {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())},
+ {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())},
+ {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())},
+ {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}};
}
} //namespace arm_gemm
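For readers of the reformatted Window/NDRange helpers, a small usage sketch follows. It assumes compilation inside the library tree so the header path shown resolves; it is illustrative only and not part of the patch:

#include "arm_compute/core/Window.h"

#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"

arm_gemm::ndrange_t example_sizes()
{
    arm_compute::Window win;
    // Set all six dimensions explicitly so the converted sizes are fully determined.
    win.set(0, arm_compute::Window::Dimension(0, 128)); // 128 elements along x
    win.set(1, arm_compute::Window::Dimension(0, 32));  // 32 elements along y
    for (unsigned int i = 2; i < 6; ++i)
    {
        win.set(i, arm_compute::Window::Dimension(0, 1));
    }
    // Per to_ndrange() above, each size is end - start: {128, 32, 1, 1, 1, 1}.
    return arm_gemm::to_ndrange(win);
}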
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 834cd1061e..6fe9f13f02 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -25,7 +25,6 @@
#include "convolution_parameters.hpp"
#include "ndrange.hpp"
-
#include <cstddef>
namespace arm_gemm
@@ -51,10 +50,19 @@ public:
* appropriately typed pointers. If B is pretransposed (see below) then
* the settings for B here are ignored.
*/
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+ virtual void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) = 0;
/** @returns an ndrange containing ranges of the compute space which can be
* broken up and parallelised over
@@ -73,7 +81,7 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int) {};
+ virtual void set_nthreads(int){};
/* Whether this GEMM can be dynamically scheduled or not. */
virtual bool supports_dynamic_scheduling() const
@@ -95,7 +103,7 @@ public:
return 0;
}
/* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
+ virtual void set_working_space(void *){};
/*** "Pretransposed" interface (optional) ***/
/* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
@@ -122,7 +130,8 @@ public:
/* The "real" version of this depends on the templated operand type (see below). */
virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
/* Threaded version with window start/end parameters */
- virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
virtual void set_pretransposed_B_data(void *)
@@ -186,10 +195,19 @@ protected:
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+ virtual void set_arrays(const To *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const To *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ Tr *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const Tr *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride)
{
_Aptr = A;
_lda = lda;
@@ -207,25 +225,33 @@ public:
}
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+ void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) override
{
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
static_cast<const Tr *>(bias), bias_multi_stride);
}
/*** "Pretransposed" interface ***/
/* Compute col sums over all columns */
- virtual void requantize_bias(void *, const To *, const int, const int) {};
+ virtual void requantize_bias(void *, const To *, const int, const int){};
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
+ virtual void pretranspose_B_array(void *, const To *, const int, const int){};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
@@ -237,12 +263,14 @@ public:
* The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
* just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
* legal values for start and end are 0 and 1 respectively. */
- virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+ virtual void
+ pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
{
pretranspose_B_array(out, in, row_stride, multi_stride);
};
- void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+ void pretranspose_B_array_part_generic(
+ void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
{
pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
}
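The threaded pretranspose entry points above take a [start, end) sub-range of a pretranspose window, and the comment in the hunk notes that the fallback window size is 1. A hypothetical caller-side sketch (helper name invented, not part of the patch) built only on the pretranspose_B_array_part() signature shown above:

#include <cstddef>

#include "src/cpu/kernels/assembly/gemm_common.hpp"

template <typename To, typename Tr>
void pretranspose_split(arm_gemm::GemmCommon<To, Tr> &gemm,
                        void *out, const To *in, int row_stride, int multi_stride,
                        std::size_t window_size, std::size_t nthreads)
{
    for (std::size_t t = 0; t < nthreads; ++t)
    {
        const std::size_t start = (window_size * t) / nthreads;
        const std::size_t end   = (window_size * (t + 1)) / nthreads;
        if (start < end)
        {
            // Each chunk is independent, so each call could run on its own thread;
            // shown sequentially here for brevity. With the fallback window size of 1
            // this degenerates to a single call with start = 0, end = 1.
            gemm.pretranspose_B_array_part(out, in, row_stride, multi_stride, start, end);
        }
    }
}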
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp
index 1c8261aef7..baccdc0d88 100644
--- a/src/cpu/kernels/assembly/ndrange.hpp
+++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -45,8 +45,7 @@ private:
unsigned int m_end = 0;
public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
- : m_parent(p), m_pos(s), m_end(e)
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e)
{
}
@@ -59,12 +58,12 @@ private:
{
unsigned int r = m_pos;
- if(d < (D - 1))
+ if (d < (D - 1))
{
r %= m_parent.m_totalsizes[d];
}
- if(d > 0)
+ if (d > 0)
{
r /= m_parent.m_totalsizes[d - 1];
}
@@ -98,9 +97,9 @@ private:
{
unsigned int t = 1;
- for(unsigned int i = 0; i < D; i++)
+ for (unsigned int i = 0; i < D; i++)
{
- if(m_sizes[i] == 0)
+ if (m_sizes[i] == 0)
{
m_sizes[i] = 1;
}
@@ -116,14 +115,12 @@ public:
NDRange(const NDRange &rhs) = default;
template <typename... T>
- NDRange(T... ts)
- : m_sizes{ ts... }
+ NDRange(T... ts) : m_sizes{ts...}
{
set_totalsizes();
}
- NDRange(const std::array<unsigned int, D> &n)
- : m_sizes(n)
+ NDRange(const std::array<unsigned int, D> &n) : m_sizes(n)
{
set_totalsizes();
}
@@ -163,7 +160,7 @@ public:
std::array<int_t, N> sizes{};
std::size_t i = 0;
- for(auto &p : list)
+ for (auto &p : list)
{
m_positions[i] = p.first;
sizes[i++] = p.second;
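NDRangeIterator::dim() above recovers one coordinate from a flat position with a single modulo and division against cumulative size products. A standalone illustration of that arithmetic, assuming (as set_totalsizes() suggests) that m_totalsizes[d] is the product of the first d+1 sizes; nothing below is part of the patch:

#include <array>
#include <cstdio>

int main()
{
    const std::array<unsigned int, 3> sizes{4, 3, 2}; // D = 3
    std::array<unsigned int, 3> total{};              // cumulative products
    unsigned int t = 1;
    for (unsigned int i = 0; i < 3; ++i)
    {
        t *= sizes[i];
        total[i] = t;                                 // {4, 12, 24}
    }
    const unsigned int pos = 17;                      // flat index in [0, 24)
    for (unsigned int d = 0; d < 3; ++d)
    {
        unsigned int r = pos;
        if (d < 2)
        {
            r %= total[d];
        }
        if (d > 0)
        {
            r /= total[d - 1];
        }
        std::printf("dim %u = %u\n", d, r);           // 1, 1, 1  (17 = 1 + 1*4 + 1*12)
    }
    return 0;
}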
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
index 5661479059..dbdec5fb50 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -29,7 +29,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
index 34ff9224d5..0224b3406a 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -26,7 +26,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp32_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
index b3ffd0a676..5a2939b587 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
@@ -29,7 +29,11 @@ namespace arm_compute
{
namespace cpu
{
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
@@ -41,7 +45,8 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c
const auto scale_before = bbinfo.scale();
const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
- auto pred_ptr = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto pred_ptr =
+ reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
const auto boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -49,41 +54,49 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c
const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform();
Iterator box_it(boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
- const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
- const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
- const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
- const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
- const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
- const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
- const float ctr_x = (b0 / scale_before) + 0.5f * width;
- const float ctr_y = (b1 / scale_before) + 0.5f * height;
- for(size_t j = 0; j < num_classes; ++j)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
- const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
- float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
- float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
- // Clip dw and dh
- dw = std::min(dw, bbinfo.bbox_xform_clip());
- dh = std::min(dh, bbinfo.bbox_xform_clip());
- // Determine the predictions
- const float pred_ctr_x = dx * width + ctr_x;
- const float pred_ctr_y = dy * height + ctr_y;
- const float pred_w = std::exp(dw) * width;
- const float pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
- pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
- }
- },
- box_it);
+ const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
+ const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
+ const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
+ const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
+ const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
+ const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
+ const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
+ const float ctr_x = (b0 / scale_before) + 0.5f * width;
+ const float ctr_y = (b1 / scale_before) + 0.5f * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
+ const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
+ float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
+ float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
+ // Clip dw and dh
+ dw = std::min(dw, bbinfo.bbox_xform_clip());
+ dh = std::min(dh, bbinfo.bbox_xform_clip());
+ // Determine the predictions
+ const float pred_ctr_x = dx * width + ctr_x;
+ const float pred_ctr_y = dy * height + ctr_y;
+ const float pred_w = std::exp(dw) * width;
+ const float pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 1] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 2] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f),
+ pred_qinfo);
+ pred_ptr[delta_id + 3] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f),
+ pred_qinfo);
+ }
+ },
+ box_it);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
index 7f990396df..d8013c6227 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
@@ -30,7 +30,11 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
const size_t deltas_width = deltas->info()->tensor_shape()[0];
@@ -46,44 +50,53 @@ void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITe
auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
Iterator box_it(boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<T *>(box_it.ptr());
- const auto b0 = *ptr;
- const auto b1 = *(ptr + 1);
- const auto b2 = *(ptr + 2);
- const auto b3 = *(ptr + 3);
- const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
- const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
- const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
- const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
- for(size_t j = 0; j < num_classes; ++j)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
- const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
- T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
- T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
- // Clip dw and dh
- dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
- dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
- // Determine the predictions
- const T pred_ctr_x = dx * width + ctr_x;
- const T pred_ctr_y = dy * height + ctr_y;
- const T pred_w = std::exp(dw) * width;
- const T pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
- pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
- pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
- pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
- }
- },
- box_it);
+ const auto ptr = reinterpret_cast<T *>(box_it.ptr());
+ const auto b0 = *ptr;
+ const auto b1 = *(ptr + 1);
+ const auto b2 = *(ptr + 2);
+ const auto b3 = *(ptr + 3);
+ const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
+ const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
+ const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
+ const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
+ const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
+ T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
+ T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
+ // Clip dw and dh
+ dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
+ dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
+ // Determine the predictions
+ const T pred_ctr_x = dx * width + ctr_x;
+ const T pred_ctr_y = dy * height + ctr_y;
+ const T pred_w = std::exp(dw) * width;
+ const T pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 1] =
+ scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+ pred_ptr[delta_id + 2] =
+ scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 3] =
+ scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+ }
+ },
+ box_it);
}
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
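bounding_box_transform() above decodes each box through the usual centre/size parameterisation. A worked scalar example of the same arithmetic, assuming scale_before = scale_after = 1, unit weights and no dw/dh clipping (illustration only, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const float b[4] = {10.f, 10.f, 29.f, 49.f};   // box: x1, y1, x2, y2
    const float d[4] = {0.1f, -0.2f, 0.25f, 0.0f}; // deltas: dx, dy, dw, dh
    const float img_w = 128.f, img_h = 128.f;

    const float w     = b[2] - b[0] + 1.f;         // 20
    const float h     = b[3] - b[1] + 1.f;         // 40
    const float ctr_x = b[0] + 0.5f * w;           // 20
    const float ctr_y = b[1] + 0.5f * h;           // 30

    const float pcx = d[0] * w + ctr_x;            // 22
    const float pcy = d[1] * h + ctr_y;            // 22
    const float pw  = std::exp(d[2]) * w;          // ~25.68
    const float ph  = std::exp(d[3]) * h;          // 40

    const float x1 = std::clamp(pcx - 0.5f * pw, 0.f, img_w - 1.f);
    const float y1 = std::clamp(pcy - 0.5f * ph, 0.f, img_h - 1.f);
    std::printf("pred x1=%.2f y1=%.2f\n", x1, y1); // ~9.16, 2.00
    return 0;
}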
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
index b27c187df3..64ef815195 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
@@ -26,7 +26,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_qu16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h
index 8f06acc8a6..4da725a257 100644
--- a/src/cpu/kernels/boundingboxtransform/list.h
+++ b/src/cpu/kernels/boundingboxtransform/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
- void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
+ void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \
+ const Window &window)
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform);
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform);
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform);
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 6cd0c8500b..2897f4b242 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
+
#include "src/cpu/kernels/cast/list.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
#include "support/SaturateCast.h"
#include "arm_neon.h"
@@ -35,7 +36,8 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_qasymm8_signed_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_s32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {
+ {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
+ vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12)
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp16_to_other_dt_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
- }
+ vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
/* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
+ vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion F16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
- vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
- vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+ vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
+ vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion F16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
}
}
-void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_u8_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
/* Up-conversion U8 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
return;
}
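Every cast kernel above follows the same loop-bound pattern: a vector body consuming window_step_x elements per iteration, then a scalar tail for the remainder. A plain-integer illustration of those bounds (not part of the patch):

#include <cstdio>

int main()
{
    const int window_start_x = 0, window_end_x = 37, window_step_x = 16;

    int x = window_start_x;
    for (; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        std::printf("vector block [%2d, %2d)\n", x, x + window_step_x); // [0,16), [16,32)
    }
    for (; x < window_end_x; ++x)
    {
        std::printf("scalar tail   %2d\n", x);                          // 32..36
    }
    return 0;
}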
diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h
index ffd82d5bf3..5e634fc170 100644
--- a/src/cpu/kernels/cast/list.h
+++ b/src/cpu/kernels/cast/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_CAST_KERNEL(func_name) \
- void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
+#define DECLARE_CAST_KERNEL(func_name) \
+ void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \
+ const Window &window)
DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast);
DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast);
@@ -41,4 +42,4 @@ DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast);
#undef DECLARE_CAST_KERNEL
} // namespace cpu
} // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h
index 3bfa124dc3..082c60be29 100644
--- a/src/cpu/kernels/conv3d/neon/list.h
+++ b/src/cpu/kernels/conv3d/neon/list.h
@@ -27,8 +27,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/conv3d/neon/quantized.h"
namespace arm_compute
@@ -36,7 +37,12 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_float_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
{
const ITensor *src = src0;
const ITensor *weights = src1;
@@ -88,91 +94,104 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con
Iterator wei(weights, window_w);
const T *biases_ptr = nullptr;
- if(biases != nullptr)
+ if (biases != nullptr)
{
biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
- const int in_d_end_t = in_d_start_t + kernel_dim_d;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_d_start = std::max(in_d_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
- const int in_d_end = std::min(in_d_end_t, input_dim_d);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_d_start = in_d_start - in_d_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
- const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
-
- const int index_c_out_end = weights->info()->dimension(0);
- const int index_c_in_end = weights->info()->dimension(1);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- /*
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
* This is the loop in the weights, and it goes along OFM (output feature map)
*/
- const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- T out_temp = static_cast<T>(0);
- T *out_ptr = reinterpret_cast<T *>(out.ptr());
- for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
- {
- const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
- const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ T out_temp = static_cast<T>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c_in = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
- index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- //Load Cin weights
- for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
{
- w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ //Load Cin weights
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
}
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
}
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) =
+ (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
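For orientation, the loops above compute each output point as a plain dot product over the clamped kernel window and all input channels; the per-lane weight gather exists because the weights keep Cout as the fastest-moving dimension. The scalar sketch below reconstructs that accumulation under the layout assumptions visible in the kernel (NDHWC input, weights with Cout contiguous); the names, signature and the omitted border clamping are illustrative only, not the library's API.

#include <cstddef>
#include <vector>

// Scalar reference for a single conv3d output point (padding clamps omitted).
// src: NDHWC slab for one batch, channels contiguous  -> D * H * W * cin floats
// wei: weights with Cout innermost (dim0 in the kernel) -> Kd * Kh * Kw * cin * cout floats
float conv3d_point_ref(const std::vector<float> &src, const std::vector<float> &wei, const float *bias,
                       int cin, int cout, int W, int H, int Kw, int Kh, int Kd,
                       int iw0, int ih0, int id0, int ofm)
{
    float acc = 0.f;
    for (int kd = 0; kd < Kd; ++kd)
        for (int kh = 0; kh < Kh; ++kh)
            for (int kw = 0; kw < Kw; ++kw)
                for (int c = 0; c < cin; ++c)
                {
                    const float x = src[((static_cast<std::size_t>(id0 + kd) * H + (ih0 + kh)) * W + (iw0 + kw)) * cin + c];
                    // Cout is the fastest-moving weight dimension, hence the
                    // 'weights_ptr_mover += index_c_out_end' stride in the kernel above.
                    const float w = wei[(((static_cast<std::size_t>(kd) * Kh + kh) * Kw + kw) * cin + c) * cout + ofm];
                    acc += x * w;
                }
    return (bias != nullptr) ? acc + bias[ofm] : acc;
}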
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h
index a8165b4944..f0fc9b5a71 100644
--- a/src/cpu/kernels/conv3d/neon/quantized.h
+++ b/src/cpu/kernels/conv3d/neon/quantized.h
@@ -28,16 +28,22 @@
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_quantized_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
{
const ITensor *src = src0;
const ITensor *weights = src1;
@@ -104,153 +110,166 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1,
Iterator wei(weights, window_w);
const int32_t *biases_ptr = nullptr;
- if(biases != nullptr)
+ if (biases != nullptr)
{
biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
- const int in_d_end_t = in_d_start_t + kernel_dim_d;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_d_start = std::max(in_d_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
- const int in_d_end = std::min(in_d_end_t, input_dim_d);
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_d_start = in_d_start - in_d_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
- const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
- const int index_c_out_end = weights->info()->dimension(0);
- const int index_c_in_end = weights->info()->dimension(1);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- /*
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
* This is the loop in the weights, and it goes along OFM (output feature map)
*/
- const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- int32_t acc = static_cast<int32_t>(0);
- T *out_ptr = reinterpret_cast<T *>(out.ptr());
- for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
- {
- const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
- const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ int32_t acc = static_cast<int32_t>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c_in = 0;
- vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
-
- q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-
- for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
- index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- //Load Cin weights
- for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
{
- w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
- }
- q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
- const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
- const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
- const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ //Load Cin weights
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
+ q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
- src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
- src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
- src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+ q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
- wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
- wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
- wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+ const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
+ const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
+ const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
+ const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
- acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
- acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
- acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
- acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
- }
+ src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
+ src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
+ src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
+ src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+
+ wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
+ wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
+ wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
+ wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+
+ acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
+ acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
+ acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
+ acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
+ }
#if defined(__aarch64__)
- acc += wrapper::vaddv(acc_q32_0);
- acc += wrapper::vaddv(acc_q32_1);
- acc += wrapper::vaddv(acc_q32_2);
- acc += wrapper::vaddv(acc_q32_3);
+ acc += wrapper::vaddv(acc_q32_0);
+ acc += wrapper::vaddv(acc_q32_1);
+ acc += wrapper::vaddv(acc_q32_2);
+ acc += wrapper::vaddv(acc_q32_3);
#else // __aarch64__
- auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
#endif // __aarch64__
- for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
- {
- const auto src_val = *(in_ptr_mover) + input_offset;
- const auto w_val = *(weights_ptr_mover) + weights_offset;
- acc += src_val * w_val;
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover) + input_offset;
+ const auto w_val = *(weights_ptr_mover) + weights_offset;
+ acc += src_val * w_val;
+ }
+ }
}
}
- }
- }
- if(biases)
- {
- acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
- }
+ if (biases)
+ {
+ acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
+ }
- T out_val = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
- *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+ T out_val =
+ finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
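The quantized path above performs the same reduction in s32: each input and weight value has its zero point added before the multiply (exactly what the scalar leftover loop shows), a bias is optionally folded in, and finalize_quantization() rescales the accumulator back to 8 bit with a fixed-point multiplier/shift pair plus the output offset. The sketch below models that arithmetic with a plain float rescale standing in for the (output_multiplier, output_shift) pair, so it illustrates the math rather than reproduces the library routine.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar model of one quantized output point: offset-corrected dot product,
// optional bias, then requantization to the output's 8-bit domain.
uint8_t quantized_point_ref(const std::vector<uint8_t> &x, const std::vector<uint8_t> &w,
                            int32_t input_offset, int32_t weights_offset,
                            int32_t bias, float rescale, int32_t output_offset)
{
    int32_t acc = 0;
    for (std::size_t i = 0; i < x.size() && i < w.size(); ++i)
    {
        // Same shape as the leftover loop: offsets are applied before the product.
        acc += (static_cast<int32_t>(x[i]) + input_offset) * (static_cast<int32_t>(w[i]) + weights_offset);
    }
    acc += bias;
    const int32_t out = static_cast<int32_t>(std::lround(acc * rescale)) + output_offset;
    return static_cast<uint8_t>(std::clamp(out, 0, 255));
}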
diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h
index 1fe8e11e98..8fb7ad2087 100644
--- a/src/cpu/kernels/crop/generic/neon/crop_helper.h
+++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h
@@ -80,7 +80,7 @@ inline float32x4_t load_as_f32(uint8_t *ptr)
{
return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
}
-}
+} // namespace cpu
} // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp
index 218ebba191..3739c9d4e0 100644
--- a/src/cpu/kernels/crop/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp
@@ -29,12 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp
index 16d0218fce..f665c3652c 100644
--- a/src/cpu/kernels/crop/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp
@@ -28,11 +28,18 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h
index a59588be45..b90ba9ddbf 100644
--- a/src/cpu/kernels/crop/generic/neon/impl.h
+++ b/src/cpu/kernels/crop/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/crop/generic/neon/crop_helper.h"
namespace arm_compute
@@ -35,19 +36,26 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Reverse elements if width flipped.
- if(is_width_flipped)
+ if (is_width_flipped)
{
// Collapse first dimension if possible.
- if(input_has_single_channel)
+ if (input_has_single_channel)
{
int32_t x = output_width_start;
Coordinates negative_offset(input_offset);
negative_offset.set(1, negative_offset[1] - window_step_x + 1);
- for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+ for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
@@ -57,25 +65,27 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o
wrapper::vstore(output_ptr + x, in);
}
input_offset[1] = negative_offset[1] + window_step_x - 1;
- for(; x < output_width_limit; ++x, --input_offset[1])
+ for (; x < output_width_limit; ++x, --input_offset[1])
{
*(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
else
{
- for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+ for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
{
input_offset.set(0, 0);
int32_t c = 0;
- for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+ for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x;
+ c += window_step_x, input_offset[0] += window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
}
- for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+ for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
{
- *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ *(output_ptr + x * output->info()->dimension(0) + c) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
}
@@ -83,25 +93,28 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o
else
{
// Use memcpy if the elements don't need converting to float.
- if(std::is_same<T, float>::value)
+ if (std::is_same<T, float>::value)
{
memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
- (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+ (output_width_limit - output_width_start) * output->info()->dimension(0) *
+ output->info()->element_size());
}
else
{
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ int32_t x = 0;
+ int32_t limit =
+ (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+ for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x, ++input_offset[0])
+ for (; x < limit; ++x, ++input_offset[0])
{
- *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ *(output_start_ptr + x) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
}
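in_bounds_crop_window() above converts each element to float as it copies the crop, takes the straight memcpy fast path only when T is already float and the width is not flipped, and otherwise walks the input backwards for flipped crops. A minimal scalar equivalent of the single-channel row copy, with plain pointers standing in for the ITensor/Window machinery, looks like this (illustration only):

#include <cstdint>

template <typename T>
void crop_row_to_f32_ref(const T *in_row, int32_t in_x_start, float *out_row,
                         int32_t out_start, int32_t out_limit, bool width_flipped)
{
    int32_t in_x = in_x_start;
    for (int32_t x = out_start; x < out_limit; ++x)
    {
        out_row[x] = static_cast<float>(in_row[in_x]); // convert on store, as load_as_f32 + vstore do in vector form
        in_x += width_flipped ? -1 : +1;               // flipped crops consume the input right-to-left
    }
}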
diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp
index ebf2c1fbd3..602434f54f 100644
--- a/src/cpu/kernels/crop/generic/neon/integer.cpp
+++ b/src/cpu/kernels/crop/generic/neon/integer.cpp
@@ -29,46 +29,88 @@ namespace arm_compute
{
namespace cpu
{
-void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h
index a6b83215ae..9cb7726203 100644
--- a/src/cpu/kernels/crop/list.h
+++ b/src/cpu/kernels/crop/list.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/crop/generic/neon/impl.h"
namespace arm_compute
@@ -36,7 +37,8 @@ namespace cpu
{
#define DECLARE_CROP_KERNEL(func_name) \
void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \
+ bool input_has_single_channel, bool is_width_flipped)
DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window);
DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window);
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
index e85a1664ea..293e606d81 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
@@ -29,11 +29,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp16_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
index b2333a3334..c6fa4790b7 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
@@ -26,10 +26,15 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp32_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
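The quantized depthwise loops in the next file (like finalize_quantization() in the conv3d kernel earlier) rescale their s32 accumulators with a gemmlowp-style fixed-point pair: a Q31 multiplier applied via a saturating rounding doubling high multiply, then a rounding right shift, with the negative-shift case handled by pre-multiplying the accumulator by 2^(-shift). The helpers below are a scalar reference for that rescale; the names are hypothetical and only model what saturating_doubling_high_mul / rounding_divide_by_exp2 compute in the patch.

#include <cstdint>
#include <limits>

// Rounded high half of the doubled 64-bit product, saturated (SQRDMULH semantics).
int32_t sat_rounding_doubling_high_mul_ref(int32_t a, int32_t b)
{
    const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
    const int32_t high  = static_cast<int32_t>((ab + nudge) / (1ll << 31));
    return overflow ? std::numeric_limits<int32_t>::max() : high;
}

// Divide by 2^exponent, rounding to nearest; exponent in [0, 31).
int32_t rounding_divide_by_exp2_ref(int32_t x, int exponent)
{
    const int32_t mask      = static_cast<int32_t>((1ll << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0); // relies on arithmetic >> for negatives
}

// result ~= acc * multiplier / 2^(31 + shift); the output zero point is added afterwards.
int32_t requantize_ref(int32_t acc, int32_t multiplier, int shift, int32_t output_offset)
{
    if (shift < 0)
    {
        return sat_rounding_doubling_high_mul_ref(acc * (1 << (-shift)), multiplier) + output_offset;
    }
    return rounding_divide_by_exp2_ref(sat_rounding_doubling_high_mul_ref(acc, multiplier), shift) + output_offset;
}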
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
index a2ae5564e6..d08e973968 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -65,8 +67,16 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
namespace
{
template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_multiplier1_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
constexpr auto element_per_vector = vector_size / sizeof(T);
@@ -75,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
using AccType = int32_t;
using AccArrayType = std::array<AccType, element_per_vector>;
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
@@ -104,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto const base_weights_ptr = weights_it.ptr();
- size_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- AccArrayType acc{};
- AccArrayType in_sum{};
- AccArrayType we_sum{};
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ size_t x = run_info.x_start;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- out_of_bound_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ AccArrayType acc{};
+ AccArrayType in_sum{};
+ AccArrayType we_sum{};
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t i = 0; i < element_per_vector; ++i)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : out_of_bound_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) += input_vals[i] * weights_vals[i];
+ in_sum.at(i) += input_vals[i];
+ we_sum.at(i) += weights_vals[i];
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) -= in_sum.at(i) * weights_qoffset;
+ acc.at(i) -= we_sum.at(i) * input_qoffset;
+ acc.at(i) += k_offset;
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
+ if (has_biases)
+ {
+ acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ }
- if(has_biases)
- {
- acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ const int32_t out_mul = output_multiplier.at(x + i);
+ const int32_t out_shift = output_shift.at(x + i);
+ if (out_shift < 0)
+ {
+ acc.at(i) =
+ saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) +
+ output_qoffset;
+ }
+ out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
}
- const int32_t out_mul = output_multiplier.at(x + i);
- const int32_t out_shift = output_shift.at(x + i);
- if(out_shift < 0)
- {
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
- }
- out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
}
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
- }
-
- // left-over
- for(; x < run_info.x_end; ++x)
- {
- AccType acc = 0;
- AccType in_sum = 0;
- AccType we_sum = 0;
+ // left-over
+ for (; x < run_info.x_end; ++x)
+ {
+ AccType acc = 0;
+ AccType in_sum = 0;
+ AccType we_sum = 0;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ?
- *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
- out_of_bound_value;
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc += input_val * weights_val;
- in_sum += input_val;
- we_sum += weights_val;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : out_of_bound_value;
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc += input_val * weights_val;
+ in_sum += input_val;
+ we_sum += weights_val;
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ T out_vals{0};
- T out_vals{ 0 };
+ acc -= in_sum * weights_qoffset;
+ acc -= we_sum * input_qoffset;
+ acc += k_offset;
- acc -= in_sum * weights_qoffset;
- acc -= we_sum * input_qoffset;
- acc += k_offset;
+ if (has_biases)
+ {
+ acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
+ }
- if(has_biases)
- {
- acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
- }
+ const int32_t out_mul = output_multiplier.at(x);
+ const int32_t out_shift = output_shift.at(x);
- const int32_t out_mul = output_multiplier.at(x);
- const int32_t out_shift = output_shift.at(x);
+ if (out_shift < 0)
+ {
+ acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ }
- if(out_shift < 0)
- {
- acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
}
-
- out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_generic_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
using AccType = int32_t;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
@@ -277,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<AccType> acc(depth_multiplier, 0);
- std::vector<AccType> we_sum(depth_multiplier, 0);
- AccType in_sum = 0;
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<AccType> acc(depth_multiplier, 0);
+ std::vector<AccType> we_sum(depth_multiplier, 0);
+ AccType in_sum = 0;
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) += input_val * weights_val;
-
- we_sum.at(m) += weights_val;
- }
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : out_of_bound_value;
- offs += dilation.x() * run_info.input_stride_y;
- in_sum += input_val;
- }
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) += input_val * weights_val;
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ we_sum.at(m) += weights_val;
+ }
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- acc.at(m) -= in_sum * weights_qoffset;
- acc.at(m) -= we_sum.at(m) * input_qoffset;
- acc.at(m) += k_offset;
+ offs += dilation.x() * run_info.input_stride_y;
+ in_sum += input_val;
+ }
- if(has_biases)
- {
- acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
- const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
- if(out_shift < 0)
- {
- acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
+ for (size_t m = 0; m < depth_multiplier; ++m)
{
- acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
+ acc.at(m) -= in_sum * weights_qoffset;
+ acc.at(m) -= we_sum.at(m) * input_qoffset;
+ acc.at(m) += k_offset;
+
+ if (has_biases)
+ {
+ acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ }
+
+ const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
+ const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
+ if (out_shift < 0)
+ {
+ acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) +
+ output_qoffset;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) =
+ static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
}
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
constexpr int half_vec = vector_size / 2;
@@ -355,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
- const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+ const auto input_qoffset_vec = wrapper::vreinterpret(
+ wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
+ const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(
+ wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset,
+ arm_compute::wrapper::traits::vector_128_tag{});
const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -389,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
@@ -397,95 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::fill(begin(acc0), end(acc0), zero);
- std::fill(begin(acc1), end(acc1), zero);
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::fill(begin(acc0), end(acc0), zero);
+ std::fill(begin(acc1), end(acc1), zero);
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- const int32_t current_h = input_z + h * dilation.y();
- if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ const int32_t current_h = input_z + h * dilation.y();
+ if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
{
- const int32_t current_w = input_y + w * dilation.x();
- if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
- const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
- const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ const int32_t current_w = input_y + w * dilation.x();
+ if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
{
- const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
- const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
-
- acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
- acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
+ const auto input_8x8 = wrapper::vdup_n(
+ *(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))),
+ TagType{});
+ const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
+ const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
+
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ {
+ const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(
+ weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
+ const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+
+ acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs),
+ wrapper::vgetlow(weights_no_offs));
+ acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs),
+ wrapper::vgethigh(weights_no_offs));
+ }
}
- }
- offs += dilation.x() * run_info.input_stride_y;
+ offs += dilation.x() * run_info.input_stride_y;
+ }
}
- }
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- if(has_biases)
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
{
- const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
+ if (has_biases)
+ {
+ const auto bias_val0 =
+ wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ const auto bias_val1 = wrapper::vloadq(
+ reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
- acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
- acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
- }
+ acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
+ acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
+ }
- if(out_shift < 0)
- {
- acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- }
- else
- {
- acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
- }
+ if (out_shift < 0)
+ {
+ acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ }
+ else
+ {
+ acc0.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ }
- acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
- acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
+ acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
+ acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
- const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
- wrapper::vmovn(acc1.at(i)));
+ const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i)));
- if(std::is_same<T, uint8_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
+ if (std::is_same<T, uint8_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)),
+ wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)),
+ wrapper::vqmovn(out_val));
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
} // namespace
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
@@ -497,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
const auto output_scale = dst->info()->quantization_info().uniform().scale;
auto weights_scale = weights->info()->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ if (!is_data_type_quantized_per_channel(weights->info()->data_type()))
{
- for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
+ for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
{
weights_scale.push_back(weights_scale.front());
}
}
- for(const auto &s : weights_scale)
+ for (const auto &s : weights_scale)
{
int32_t out_mult = 0;
int32_t out_shift = 0;
@@ -516,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
output_shift.push_back(out_shift);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier,
+ output_shift, window, has_biases);
}
else
{
const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
- if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
+ if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
{
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation,
+ depth_multiplier, output_multiplier, output_shift, window,
+ has_biases);
}
else
{
- depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier,
+ output_multiplier, output_shift, window, has_biases);
}
}
}
-template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute
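For reference, the per-lane requantization exercised by the quantized loops above (saturating doubling high multiply, rounding right shift by out_shift, output offset add, then clamping and narrowing) can be sketched in scalar form as below. This is a minimal sketch of the fixed-point scheme, not the library's NEON code; the helper names are illustrative stand-ins for saturating_doubling_high_mul and rounding_divide_by_exp2.

#include <cstdint>
#include <limits>

// Scalar stand-in for saturating_doubling_high_mul: high 32 bits of 2*a*b with
// round-to-nearest; saturates only for the single overflow case INT32_MIN * INT32_MIN.
inline int32_t scalar_sat_doubling_high_mul(int32_t a, int32_t b)
{
    const bool    overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab_64    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge    = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
    const int32_t high32   = static_cast<int32_t>((ab_64 + nudge) / (int64_t(1) << 31));
    return overflow ? std::numeric_limits<int32_t>::max() : high32;
}

// Scalar stand-in for rounding_divide_by_exp2: arithmetic shift right with round-to-nearest.
inline int32_t scalar_rounding_divide_by_exp2(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One accumulator lane, following the same branch structure as the kernel above:
// a negative out_shift means an effective multiplier >= 1, handled by pre-scaling.
inline int32_t requantize_lane(int32_t acc, int32_t out_mul, int out_shift, int32_t out_offset)
{
    if (out_shift < 0)
    {
        return scalar_sat_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + out_offset;
    }
    return scalar_rounding_divide_by_exp2(scalar_sat_doubling_high_mul(acc, out_mul), out_shift) + out_offset;
}

The kernel then clamps each lane to the [lower, upper] activation range and narrows back to 8 bits, as the vmin/vmax/vqmovn sequence above shows.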
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
index 8410cdbf16..3fa5c58c3c 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
@@ -24,6 +24,7 @@
#ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo
const size_t input_width;
const size_t input_depth;
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
- : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ const PadStrideInfo &conv_info,
+ const Window &w,
+ uint32_t depth_multiplier = 1) // NOLINT
+ : num_read_elements_per_iteration(
+ (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
x_start(w.x().start()),
x_end(w.x().end()),
x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
input_stride_y(input.strides_in_bytes().y()),
input_stride_z(input.strides_in_bytes().z()),
- input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+ input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) -
+ (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
weights_width(weights.dimension(width_idx)),
weights_height(weights.dimension(height_idx)),
weights_stride_y(weights.strides_in_bytes().y()),
@@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo
}
};
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
+inline bool is_valid_input_region(int32_t base_w,
+ uint32_t base_h,
+ uint32_t w,
+ uint32_t h,
+ const DepthwiseConvolutionRunInfo &run_info,
+ const Size2D &dilation)
{
const int32_t current_h = base_h + h * dilation.y();
const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
@@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u
}
template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const Window &window,
+ bool has_biases)
{
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
@@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto const base_weights_ptr = weights_it.ptr();
- uint32_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- VectorType acc = zero_vector;
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto const base_weights_ptr = weights_it.ptr();
+ uint32_t x = run_info.x_start;
- for(uint32_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(uint32_t w = 0; w < run_info.weights_width; ++w)
+ VectorType acc = zero_vector;
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for (uint32_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- zero_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
- acc = wrapper::vmla(acc, weights_vals, input_vals);
+ int64_t offs = input_offset + x * sizeof(T);
+ for (uint32_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : zero_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ if (has_biases)
+ {
+ const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc = wrapper::vadd(acc, biases_vals);
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
}
- if(has_biases)
+ for (; x < run_info.x_end; ++x)
{
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc = wrapper::vadd(acc, biases_vals);
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
- }
+ auto acc_scalar = T{0};
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
- for(; x < run_info.x_end; ++x)
- {
- auto acc_scalar = T{ 0 };
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
- const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc_scalar += (input_vals * weights_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : 0;
+ const auto weights_vals =
+ *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc_scalar += (input_vals * weights_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc_scalar += biases_vals;
+ if (has_biases)
+ {
+ const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc_scalar += biases_vals;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
}
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
+void depthwise_loop_generic_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ const Window &window,
+ bool has_biases)
{
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Window execution_window = window;
execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<T> acc(depth_multiplier, static_cast<T>(0));
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<T> acc(depth_multiplier, static_cast<T>(0));
- const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : T(0);
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ if (has_biases)
{
- const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ }
}
- }
- else
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ else
{
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_float(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
Size2D dilation = info.dilation;
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
}
else
{
- depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
+ depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window,
+ has_biases);
}
}
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute
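The validity test above reduces to mapping an output point and a kernel tap back to an input coordinate through stride, padding and dilation. A minimal standalone sketch of that mapping, with illustrative names (not library API):

#include <cstdint>

// Fields mirror the quantities used by DepthwiseConvolutionRunInfo and
// is_valid_input_region above; the struct itself is illustrative.
struct ConvGeometry
{
    int32_t stride_x, stride_y;     // conv_stride_x / conv_stride_y
    int32_t pad_left, pad_top;      // conv_pad_left / conv_pad_top
    int32_t dilation_x, dilation_y; // dilation.x() / dilation.y()
    int32_t input_width, input_height;
};

// True when kernel tap (w, h) of output point (out_x, out_y) falls inside the input plane.
inline bool tap_is_inside_input(int32_t out_x, int32_t out_y, int32_t w, int32_t h, const ConvGeometry &g)
{
    const int32_t base_x = out_x * g.stride_x - g.pad_left; // top-left corner of the receptive field
    const int32_t base_y = out_y * g.stride_y - g.pad_top;
    const int32_t in_x   = base_x + w * g.dilation_x;       // position of this tap after dilation
    const int32_t in_y   = base_y + h * g.dilation_y;
    return in_x >= 0 && in_x < g.input_width && in_y >= 0 && in_y < g.input_height;
}

When a tap is outside, the loops above substitute zero (or skip the tap), and, as a guard, the byte offset is clamped to input_max_offset before any address is formed.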
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
index 1bf7ad7007..d32847c1e8 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
@@ -26,16 +26,26 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info);
}
-void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
index 58f7536064..682fad0bda 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
@@ -26,16 +26,26 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h
index 44f055d6a9..cf80608f4f 100644
--- a/src/cpu/kernels/depthwiseconv2d/list.h
+++ b/src/cpu/kernels/depthwiseconv2d/list.h
@@ -27,9 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \
- void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \
+ void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \
+ const Window &window, bool has_biases, const ConvolutionInfo &info)
DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative);
DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative);
DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative);
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
index 9a0472643d..5cbf7a36c6 100644
--- a/src/cpu/kernels/directconv2d/list.h
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -32,8 +32,9 @@ namespace cpu
{
namespace kernels
{
-#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
- void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \
+ const PadStrideInfo &conv_info)
DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
index a719fa50d6..218a4b7ee4 100644
--- a/src/cpu/kernels/directconv2d/nchw/all.cpp
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -22,18 +22,17 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
-
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
#include <algorithm>
@@ -44,22 +43,26 @@ namespace cpu
namespace kernels
{
template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nchw(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp16_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nchw<float>(window, src, weights, dst, conv_info);
}
template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nchw(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(conv_info);
@@ -107,72 +110,81 @@ void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weig
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(2);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
- execute_window_loop(window_w, [&](const Coordinates & id_w)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
- T out_temp = static_cast<T>(0);
-
- for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
- {
- const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
- const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+            // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(2);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
{
- const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
- const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
- int index_w = in_w_start;
- int index_wei_w = wei_w_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_w < in_w_end; ++index_w, ++index_wei_w)
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+ T out_temp = static_cast<T>(0);
+
+ for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
{
- const auto src_val = *(in_ptr_row + index_w * input_stride_w);
- const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp += src_val * w_val;
+ const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
+ const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
+ const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+ int index_w = in_w_start;
+ int index_wei_w = wei_w_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_w <= ((in_w_end - num_elems_read_per_iteration));
+ index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_w < in_w_end; ++index_w, ++index_wei_w)
+ {
+ const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+ const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp += src_val * w_val;
+ }
+ }
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float16_t>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
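The index arithmetic at the top of the loop body above (and in the NHWC variant later in this patch) follows one pattern: compute the theoretical receptive-field range, clamp it to the input borders, and derive the matching sub-range of kernel taps so the inner loops never touch padding. A compact sketch under those assumptions, with illustrative names:

#include <algorithm>

struct Range
{
    int start; // first valid index (inclusive)
    int end;   // one past the last valid index
};

// One spatial dimension: from output coordinate, stride, pad and kernel size,
// derive the clamped input range and the kernel-tap range that pairs with it.
inline void clamp_receptive_field(
    int out_coord, int stride, int pad, int kernel_dim, int input_dim, Range &input, Range &kernel)
{
    const int in_start_t = out_coord * stride - pad; // theoretical start, may be negative
    const int in_end_t   = in_start_t + kernel_dim;  // theoretical end, may exceed input_dim

    input.start = std::max(in_start_t, 0);
    input.end   = std::min(in_end_t, input_dim);

    kernel.start = input.start - in_start_t;            // taps skipped on the leading border
    kernel.end   = kernel_dim - (in_end_t - input.end); // taps dropped on the trailing border
}

This mirrors the in_w_start/in_h_start and wei_w_start/wei_h_end computations above; each output point then accumulates only over kernel.start <= index < kernel.end.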
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
index 9982431de5..36a8e76f13 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -30,10 +30,11 @@ namespace cpu
{
namespace kernels
{
-void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nhwc_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nhwc<float>(window, src, weights, dst, conv_info);
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
index 500ad1b420..f235167e28 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -24,16 +24,16 @@
#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <algorithm>
@@ -49,12 +49,14 @@ namespace
{
bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
{
- return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
+ return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 &&
+ weights->padding().right == 0);
}
+} // namespace
template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
// Declare useful types
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -97,7 +99,7 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
// nhwc optimized
- if(have_zero_x_internal_padding(src->info(), weights->info()))
+ if (have_zero_x_internal_padding(src->info(), weights->info()))
{
        // This function assumes that input and weights have no padding in the channel dimension
@@ -114,138 +116,154 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig
* multiplication works on the correct input/weight elements.
*/
execute_window_loop(
- window_out, [&](const Coordinates & id)
- {
- /*
+ window_out,
+ [&](const Coordinates &id)
+ {
+ /*
            * Here we create theoretical indexes which we then validate for both
            * inputs and weights.
            * As a reminder, this loop takes each output point in NHW; C is treated
            * in the weights loop.
*/
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
- const int index_h_start = in_h_start - in_h_start_t;
- const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
- const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- execute_window_loop(
- window_w, [&](const Coordinates & id_w)
- {
- /*
+            // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+ const int index_h_start = in_h_start - in_h_start_t;
+ const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+ const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
            * This is the loop over the weights, and it goes along N (the batches)
* As a reminder, the batches of the weights are translated into the
* channels of the output
*/
- const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
- + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
- const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
- {
- const T *in_ptr_mover = in_ptr_row;
- int index_wc = index_wc_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_row + index_wc);
- out_temp += src_val * w_val;
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ const T *in_ptr_row =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+ const T *weights_ptr_row =
+ reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_h = index_h_start; index_h < index_h_end;
+ ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+ {
+ const T *in_ptr_mover = in_ptr_row;
+ int index_wc = index_wc_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_wc <= index_wc_end - num_elems_read_per_iteration;
+ index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_row + index_wc);
+ out_temp += src_val * w_val;
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
else // nhwc non optimized
{
execute_window_loop(
- window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical starting input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
- execute_window_loop(
- window_w, [&](const Coordinates & id_w)
+ window_out,
+ [&](const Coordinates &id)
{
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(0);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_mover);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
+ const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
+ {
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_c <= index_c_end - num_elems_read_per_iteration;
+ index_c += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration,
+ weights_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_mover);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
+ }
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
}
-template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nhwc<float>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
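Both convolve_nchw earlier in this patch and convolve_nhwc above accumulate with the same vector-body-plus-scalar-tail shape: a NEON multiply-accumulate over full vectors, a horizontal reduction of the vector accumulator, then a scalar loop over the leftover elements. A self-contained float sketch of that shape, assuming a NEON-capable target (illustrative, not the templated library code):

#include <arm_neon.h>

// Dot product of two contiguous float spans, 4 lanes at a time plus a scalar tail.
inline float dot_vector_body_scalar_tail(const float *a, const float *b, int n)
{
    float32x4_t acc_vec = vdupq_n_f32(0.f);
    int         i       = 0;
    for (; i <= n - 4; i += 4) // vector body: multiply-accumulate per 4 lanes
    {
        acc_vec = vmlaq_f32(acc_vec, vld1q_f32(a + i), vld1q_f32(b + i));
    }
    // Horizontal reduction of the vector accumulator (the vreduce() step above).
    float acc = vgetq_lane_f32(acc_vec, 0) + vgetq_lane_f32(acc_vec, 1) +
                vgetq_lane_f32(acc_vec, 2) + vgetq_lane_f32(acc_vec, 3);
    for (; i < n; ++i) // scalar tail for the remainder
    {
        acc += a[i] * b[i];
    }
    return acc;
}

The kernels above do the same, but walk strided input/weight pointers and fold the reduction into a per-output accumulator (out_temp).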
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
index 3b26fcdf29..efb9ce8e2a 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -26,6 +26,7 @@
#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
#include "arm_compute/core/ITensor.h"
+
#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
@@ -35,7 +36,8 @@ namespace cpu
namespace kernels
{
template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
index 6091ef215e..9b4375f17c 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -35,14 +36,38 @@ void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
}
-template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -50,12 +75,30 @@ void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *
return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
}
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
index 2d8fec91c5..53ccd89dcc 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -34,25 +35,67 @@ void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
}
-template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
}
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
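
The impl.h diff that follows is the common driver these translation units instantiate: elementwise_op() takes a scalar callback, a broadcast loop and a vectorised loop, runs the vectorised body over whole NEON registers and lets the scalar callback finish the tail (and the broadcast-across-X case). The fragment below is a deliberately simplified, scalar-only sketch of that split; every name in it is invented for illustration, and the real code operates on NEON vectors through the wrapper:: layer.

    #include <algorithm>
    #include <cstddef>

    // Batched body: processes elements in blocks of `step`, returns where it stopped.
    using BlockFunc  = std::size_t (*)(std::size_t start, std::size_t end, std::size_t step,
                                       const float *a, const float *b, float *dst);
    // Per-element fallback used for the leftover tail.
    using ScalarFunc = float (*)(float, float);

    void elementwise_driver(const float *a, const float *b, float *dst, std::size_t n,
                            std::size_t step, BlockFunc block_func, ScalarFunc scalar_func)
    {
        std::size_t x = block_func(0, n, step, a, b, dst); // full blocks
        for (; x < n; ++x)                                  // scalar tail
        {
            dst[x] = scalar_func(a[x], b[x]);
        }
    }

    // Example block body: a plain loop standing in for a NEON main loop.
    std::size_t max_blocks(std::size_t start, std::size_t end, std::size_t step,
                           const float *a, const float *b, float *dst)
    {
        std::size_t x = start;
        for (; x + step <= end; x += step)
        {
            for (std::size_t i = 0; i < step; ++i)
            {
                dst[x + i] = std::max(a[x + i], b[x + i]);
            }
        }
        return x;
    }

    float max_scalar(float a, float b) { return std::max(a, b); }

    // e.g. elementwise_driver(a, b, dst, n, 4, &max_blocks, &max_scalar);
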
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 98b154e8fd..98f7e8b949 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -39,7 +39,7 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type
vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = wrapper::vmax(a, b);
@@ -71,7 +71,9 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type
}
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
{
using tag_type = typename VectorType::tag_type;
using vec_type = typename VectorType::type;
@@ -81,10 +83,15 @@ typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorT
}
template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+void elementwise_op(
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(
+ int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -99,7 +106,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -114,20 +121,26 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_value, output_ptr, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+ !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -139,21 +152,23 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
}
}
@@ -162,7 +177,7 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
{
auto res = ScalarType(0);
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = std::max(a, b);
@@ -183,10 +198,10 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
case ArithmeticOperation::DIV:
{
res = a / b;
- if(std::is_integral<ScalarType>::value)
+ if (std::is_integral<ScalarType>::value)
{
res = (b == 0) ? 0 : res;
- if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
+ if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
{
--res;
}
@@ -205,43 +220,56 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
}
template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
+inline int32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
+ const int32x4_t &b)
{
return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
}
template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
{
return wrapper::vdiv(a, b);
}
template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
{
return wrapper::vpow(a, b);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
{
return wrapper::vdiv(a, b);
}
template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
{
return wrapper::vpow(a, b);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+inline int elementwise_arithm_op_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *input1_ptr,
+ const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -251,14 +279,20 @@ inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int
}
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+ wrapper::vstore(output_ptr + x,
+ elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
}
return x;
}
@@ -268,10 +302,10 @@ void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out,
{
using scalar_type = typename VectorType::scalar_type;
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+ elementwise_op<scalar_type, scalar_type, VectorType>(
+ in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,
+ &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+ &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
}
template <ComparisonOperation op, typename InputScalarType>
@@ -279,7 +313,7 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS
{
bool res = false;
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
res = (a == b);
@@ -308,9 +342,9 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS
template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
{
- OutputVectorType res = { 0, 0, 0, 0 };
+ OutputVectorType res = {0, 0, 0, 0};
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
res = wrapper::vceq(a, b);
@@ -338,53 +372,75 @@ inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const Inpu
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+inline OutputVectorType
+elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
{
InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,
+ reorder ? a : broadcast_vector);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, a);
}
return x;
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
}
return x;
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
}
- if(x <= window_end_x - 4)
+ if (x <= window_end_x - 4)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for (int i = 0; i < 4; i++)
{
*(output_ptr + x + i) = wrapper::vgetlane(a, i);
}
@@ -394,11 +450,15 @@ inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -409,11 +469,15 @@ inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -424,11 +488,15 @@ inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
auto a = wrapper::vloadq(input1_ptr + x);
auto b = wrapper::vloadq(input2_ptr + x);
@@ -438,12 +506,12 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int
const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
}
- if(x <= window_end_x - 4)
+ if (x <= window_end_x - 4)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
{
*(output_ptr + x + i) = wrapper::vgetlane(res, i);
}
@@ -455,57 +523,59 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
}
inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
- qasymm8x16_t x = vld1q_u8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- }
- };
+ qasymm8x16_t x = vld1q_u8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ }};
return out;
}
inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
- qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- }
- };
+ qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ }};
return out;
}
@@ -523,17 +593,15 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void
+store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
store_quantized(output_ptr, out);
}
@@ -544,17 +612,17 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
vst1q_s8(output_ptr, vcombine_s8(pa, pb));
}
-inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized_signed(int8_t *output_ptr,
+ const float32x4x4_t &rf,
+ const float32x4_t &offset,
+ const float32x4_t &invscale)
{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
store_quantized_signed(output_ptr, out);
}
@@ -565,7 +633,8 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa
}
template <ArithmeticOperation op>
-inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+inline int8_t
+elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
{
return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
}
@@ -574,15 +643,12 @@ template <ArithmeticOperation op>
float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
- float32x4x4_t out =
- {
- {
- elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
- }
- };
+ float32x4x4_t out = {{
+ elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
+ }};
return out;
}
@@ -596,26 +662,29 @@ inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float
template <ComparisonOperation op>
inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
- uint32x4x4_t out =
- {
- {
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
- }
- };
+ uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};
return out;
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
// Get inputs and compute output
const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
@@ -627,13 +696,21 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ int8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
// Get inputs and compute output
const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
@@ -645,45 +722,71 @@ inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int w
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
}
return x;
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ int8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
}
return x;
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
@@ -694,14 +797,22 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_signed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
@@ -712,46 +823,85 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf);
}
return x;
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf);
}
return x;
}
-inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+inline void elementwise_op_quantized(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ const uint8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -772,7 +922,7 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -794,24 +944,28 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -834,32 +988,56 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
-inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void
+elementwise_comp_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -879,7 +1057,7 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -901,24 +1079,28 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -941,32 +1123,56 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
-inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void
+elementwise_op_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ int8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ int8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -986,7 +1192,7 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -1008,24 +1214,28 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -1048,22 +1258,24 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
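Note: the scalar tail in the loops above (after the vectorized neon_func/broadcast_func path returns its last processed index x) dequantizes each remaining int8 element, applies the float operation, and requantizes the result. A minimal, self-contained sketch of that tail follows; QInfo, dequantize_s8 and quantize_s8 are simplified stand-ins for the library's UniformQuantizationInfo helpers, not the real API.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct QInfo
{
    float   scale;
    int32_t offset;
};

// Illustrative stand-in for dequantize_qasymm8_signed(value, qinfo).
static float dequantize_s8(int8_t v, QInfo q)
{
    return (v - q.offset) * q.scale;
}

// Illustrative stand-in for the requantization done inside the scalar_func helpers.
static int8_t quantize_s8(float v, QInfo q)
{
    const long rounded = std::lround(v / q.scale) + q.offset;
    return static_cast<int8_t>(std::clamp<long>(rounded, -128, 127));
}

int main()
{
    const QInfo  in1{0.5f, 0}, in2{0.25f, 10}, out{0.5f, -5};
    const int8_t a = 7, b = 30;

    // Same shape as the tail loop: afs = dequantize(a), bfs = dequantize(b),
    // output = quantize(afs OP bfs) -- here OP is a plain addition.
    const float  afs = dequantize_s8(a, in1);
    const float  bfs = dequantize_s8(b, in2);
    const int8_t r   = quantize_s8(afs + bfs, out);

    std::printf("%d\n", static_cast<int>(r));
    return 0;
}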
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
index c5c528d3f3..09ad13d5eb 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
{
@@ -33,63 +34,165 @@ void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
}
-template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ArithmeticOperation op>
void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
}
-template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
}
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
}
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
}
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
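Note: the long lists being rewrapped above follow the usual explicit-instantiation pattern: the templated kernel body lives in this .cpp, so every ArithmeticOperation/ComparisonOperation that other translation units dispatch to must be instantiated here by name. A minimal, self-contained analogue with simplified types (binary_op and its int arguments are illustrative placeholders, not part of the library):

#include <cstdio>

enum class ArithmeticOperation
{
    ADD,
    SUB
};

// Templated body defined in the .cpp, like the neon_*_elementwise_binary templates above.
template <ArithmeticOperation op>
int binary_op(int a, int b)
{
    return (op == ArithmeticOperation::ADD) ? a + b : a - b;
}

// Explicit instantiations: each operation callers may use has to be listed here,
// which is what produces the per-operation lists seen in the diff.
template int binary_op<ArithmeticOperation::ADD>(int, int);
template int binary_op<ArithmeticOperation::SUB>(int, int);

int main()
{
    std::printf("%d %d\n", binary_op<ArithmeticOperation::ADD>(2, 3),
                binary_op<ArithmeticOperation::SUB>(2, 3));
    return 0;
}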
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
index fa8e08745a..d891f70644 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
{
@@ -33,27 +34,72 @@ void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe
return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
}
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comp_op_quantized<op>(in1, in2, out, window);
}
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
index abfdf93b75..b1f8e018f5 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -34,27 +35,70 @@ void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
}
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
}
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
index 85224351df..600c7f1c05 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -25,6 +25,7 @@
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -36,14 +37,38 @@ void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window);
}
-template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -51,14 +76,32 @@ void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_comparison_op<float16_t>(in1, in2, out, op, window);
}
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
index 2b479f76f1..832a966883 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -34,26 +35,68 @@ void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window);
}
-template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<float>(in1, in2, out, op, window);
}
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
index c0515f2abc..fa48407e9b 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -23,7 +23,9 @@
*/
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -33,7 +35,8 @@ namespace cpu
using namespace arm_compute::wrapper;
template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
{
using VectorType = typename sve_vector<ScalarType>::type;
@@ -51,7 +54,7 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -66,37 +69,40 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_vector = svdup_n(broadcast_value);
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
- VectorType res{};
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
- if(is_broadcast_input_2)
- {
- res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
- }
- else
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
{
- res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
- }
- svst1(pg, output_ptr + x, res);
-
- x += svcnt<ScalarType>();
- pg = svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ VectorType res{};
+
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector,
+ broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -108,39 +114,46 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto in1 = svld1(pg, input1_ptr + x);
- const auto in2 = svld1(pg, input2_ptr + x);
- const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
- svst1(pg, output_ptr + x, res);
-
- x += svcnt<ScalarType>();
- pg = svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void elementwise_arithmetic_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
template <typename InputScalarType, typename OutputScalarType>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
using OutputVectorType = typename sve_vector<OutputScalarType>::type;
const auto all_true_pg = svptrue<InputScalarType>();
@@ -157,7 +170,7 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -172,37 +185,44 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
- const auto broadcast_vector = svdup_n(broadcast_value);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- OutputVectorType res{};
- if(is_broadcast_input_2)
- {
- res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
- }
- else
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
{
- res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
- }
- svst1(output_pg, output_ptr + x, res);
-
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ OutputVectorType res{};
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, non_broadcast_vector, broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -214,37 +234,45 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto in1 = svld1(pg, input1_ptr + x);
- const auto in2 = svld1(pg, input2_ptr + x);
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, output_ptr + x, res);
-
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res =
+ elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void elementwise_comparison_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<uint8_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
template <>
svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
index 860c50a1e0..4c61b9f315 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/NEON/wrapper/svtraits.h"
@@ -51,7 +52,7 @@ svbool_t narrow_to_byte_predicate(svbool_t pg)
{
const auto all_false = svpfalse();
- switch(bytewidth)
+ switch (bytewidth)
{
case 8:
pg = svuzp1_b32(pg, all_false);
@@ -74,7 +75,7 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve
using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
VectorType res{};
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = svmax_z(pg, a, b);
@@ -114,11 +115,12 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve
}
template <typename InputVectorType, typename OutputVectorType>
-OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
+OutputVectorType
+elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
{
svbool_t selection_vector{};
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
selection_vector = svcmpeq(pg, a, b);
@@ -154,10 +156,12 @@ OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &
}
template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
template <typename ScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
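Note: the SVE loops reformatted in impl.cpp and declared in impl.h above all share the same predicated shape: build a predicate with svwhilelt for the elements that remain, load/compute/store under that predicate, advance by the vector length, and stop once svptest_any reports an all-false predicate. A minimal sketch of that shape using plain ACLE intrinsics; add_f32 is an illustrative function, not part of the library, and it requires an SVE-enabled toolchain to build.

#include <arm_sve.h> // ACLE SVE intrinsics

// Same loop shape as elementwise_arithmetic_op above, reduced to a float add.
void add_f32(const float *a, const float *b, float *out, int n)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32_s32(x, n); // predicate covering the elements left to process
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb));

        x += static_cast<int>(svcntw()); // advance by the number of 32-bit lanes
        pg = svwhilelt_b32_s32(x, n);
    } while (svptest_any(svptrue_b32(), pg));
}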
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
index c313fc6e04..f7714ff7e9 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -33,64 +34,166 @@ void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
{
return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window);
}
-template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ArithmeticOperation op>
void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window);
}
-template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<int16_t>(in1, in2, out, op, window);
}
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<int32_t>(in1, in2, out, op, window);
}
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
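
The instantiation lists reflowed above exist because these function templates are defined in a .cpp file: every ArithmeticOperation and ComparisonOperation selected at runtime must be explicitly instantiated in the same translation unit, or the link fails. A minimal, self-contained sketch of that pattern (the enum, kernel and types here are invented for illustration, not taken from the library):

#include <cstdio>

enum class Op
{
    ADD,
    SUB
};

template <Op op>
void binary_kernel(const int *a, const int *b, int *dst, int n)
{
    for (int i = 0; i < n; ++i)
    {
        dst[i] = (op == Op::ADD) ? a[i] + b[i] : a[i] - b[i];
    }
}

// One explicit instantiation per supported operation, mirroring the lists above.
template void binary_kernel<Op::ADD>(const int *, const int *, int *, int);
template void binary_kernel<Op::SUB>(const int *, const int *, int *, int);

int main()
{
    const int a[4] = {1, 2, 3, 4};
    const int b[4] = {4, 3, 2, 1};
    int       out[4];
    binary_kernel<Op::ADD>(a, b, out, 4);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // prints: 5 5 5 5
    return 0;
}

The one-parameter-per-line layout introduced above is purely a formatting change; the set of instantiations is unchanged.
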
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
index 41e0ac77db..7c6015d379 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -35,19 +35,14 @@ inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint3
{
auto x = svld1(pg, ptr);
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
pg = svptrue_b8();
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
}
inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
@@ -56,28 +51,24 @@ inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint
//vprint(x);
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
pg = svptrue_b8();
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
}
-inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -85,13 +76,14 @@ inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const
svst1(pg, ptr, narrowed);
}
-inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -101,7 +93,8 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const
}
template <typename ScalarType>
-void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
{
const auto all_true_pg = wrapper::svptrue<ScalarType>();
@@ -120,7 +113,7 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -128,8 +121,10 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -141,48 +136,52 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const float broadcast_value_f = Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
- const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
-
- svfloat32x4_t result{};
-
- if(!is_broadcast_input_2)
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
{
- result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
- }
- else
- {
- result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
- }
-
- store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svfloat32x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -200,41 +199,44 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
- const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
-
- store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+
+ const auto result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
template <typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
const auto all_true_pg = wrapper::svptrue<InputScalarType>();
@@ -251,7 +253,7 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -259,8 +261,10 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -272,51 +276,63 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
- const float broadcast_value_f = Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
- const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
-
- svuint8x4_t result{};
-
- if(!is_broadcast_input_2)
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
{
- result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), svget4(in1, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), svget4(in1, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), svget4(in1, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 3), svget4(in1, 3), op));
- }
- else
- {
- result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
- }
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, output_ptr + x, zipped);
-
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svuint8x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0),
+ svget4(in1, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1),
+ svget4(in1, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2),
+ svget4(in1, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -334,39 +350,44 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
- const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, output_ptr + x, zipped);
-
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+ const auto result =
+ svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3),
+ svget4(in2, 3), op));
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file
+#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
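
For orientation while reading the reflowed helpers: load_quantized() widens a vector of 8-bit lanes into four int32 vectors, subtracts the quantization offset and multiplies by the scale to obtain float lanes, and store_quantized() applies the inverse (multiply by 1/scale, round, add the offset, saturating narrow). A scalar sketch of the same asymmetric-quantization arithmetic, with helper names and sample values invented here:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static float dequantize_u8(uint8_t q, int32_t offset, float scale)
{
    return (static_cast<int32_t>(q) - offset) * scale; // widen, subtract offset, scale
}

static uint8_t quantize_u8(float v, int32_t offset, float inv_scale)
{
    const int32_t q = static_cast<int32_t>(std::lround(v * inv_scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturating narrow
}

int main()
{
    const int32_t offset = 128;
    const float   scale  = 0.05f;
    const uint8_t a = 150, b = 100;
    // Dequantize both operands, operate in float, requantize the result.
    const float sum = dequantize_u8(a, offset, scale) + dequantize_u8(b, offset, scale);
    std::printf("%u\n", quantize_u8(sum, offset, 1.f / scale)); // prints: 122
    return 0;
}

The SVE2 helpers above do exactly this per lane, four float vectors at a time, which is why both work on svfloat32x4_t tuples.
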
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
index 7435bb4f29..5cc66642d7 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
namespace arm_compute
{
@@ -34,27 +35,72 @@ void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe
return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comparison_quantized_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
index 1027a1eed0..165e0c05fa 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
namespace arm_compute
{
@@ -34,27 +35,70 @@ void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
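
The do/while loops reindented throughout the SVE and SVE2 files share one predication idiom: wrapper::svwhilelt builds a predicate of active lanes, the body loads, computes and stores under that predicate, x advances by wrapper::svcnt (the hardware vector length), and svptest_any ends the loop once no lane remains active, so no scalar tail is needed. A stand-alone sketch of the idiom (illustrative only; requires an SVE-enabled toolchain, e.g. -march=armv8.2-a+sve):

#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);        // active lanes for this iteration
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));

        x += svcntw();                        // advance by the vector length in words
        pg = svwhilelt_b32(x, n);             // goes all-false once x >= n
    } while (svptest_any(svptrue_b32(), pg)); // loop while any lane is still active
}
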
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
index b2833c2481..2588db024d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -23,17 +23,19 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<__fp16>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
index 6566821eca..936a2e588a 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<float>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index dbc1dde4fa..d54d3984cb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
@@ -36,7 +37,7 @@ namespace cpu
template <typename ScalarType>
inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return 1 / sqrt(a);
@@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp
template <typename ScalarType, typename VectorType>
inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return wrapper::vinvsqrt(a);
@@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
- }
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
- }
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+ }
+ },
+ input, output);
}
template <>
@@ -128,75 +131,81 @@ inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int8x16_t vout;
- auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto vconst_0_f32 = vdupq_n_f32(0);
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
-
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ int8x16_t vout;
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- // Perform activation
- float32x4x4_t vtmp_deq =
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
{
- {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ // Re-quantize to new output space
+ vout = vquantize_signed(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
}
-
- // Re-quantize to new output space
- vout = vquantize_signed(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- if(tmp_f <= 0.0)
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
- {
- tmp_f = (-128 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8_signed(
+ tmp_f, qi_out,
+ RoundingPolicy::
+                        TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above, which follows the same policy for armv7a.
+                // For aarch64, a LUT is used and rounding to nearest is applied.
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
- // For aarch64 LUT is used and rounding to nearest is used
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
template <>
inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
@@ -215,71 +224,74 @@ inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Windo
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- uint8x16_t vout;
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
+ uint8x16_t vout;
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- float32x4x4_t vtmp_deq =
- {
- {
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
- }
- // Re-quantize to new output space
- vout = vquantize(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- float tmp_f = dequantize_qasymm8(in, qi_in);
- if(tmp_f <= 0.0)
+ // Re-quantize to new output space
+ vout = vquantize(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (0 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
- {
- tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (0 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
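
The quantized uint8_t/int8_t specializations reformatted above guard LOG and RSQRT against non-positive inputs: such lanes are replaced, via vbslq_f32 on a vcleq_f32 mask in the vector path and an explicit branch in the scalar tail, by the most negative (LOG) or most positive (RSQRT) value the output quantization can represent. A plain-C++ sketch of that clamping rule, with the function name and sample quantization parameters invented here:

#include <cmath>
#include <cstdio>

static float clamp_for_unary(float in, float computed, bool is_log, int offset, float scale)
{
    if (in > 0.0f)
    {
        return computed;                    // well-defined input: keep the real result
    }
    // QASYMM8 can represent [(0 - offset) * scale, (255 - offset) * scale].
    return is_log ? (0 - offset) * scale    // LOG(<=0)   -> most negative representable
                  : (255 - offset) * scale; // RSQRT(<=0) -> most positive representable
}

int main()
{
    const int   offset = 10;
    const float scale  = 0.1f;
    std::printf("%.1f\n", clamp_for_unary(-2.0f, std::log(-2.0f), true, offset, scale));        // -1.0
    std::printf("%.1f\n", clamp_for_unary(4.0f, 1.0f / std::sqrt(4.0f), false, offset, scale)); // 0.5
    return 0;
}
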
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
index dfe5e30035..d4daad4ca6 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
index 08bb7f28b6..38cb61d0ff 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -32,24 +33,28 @@ namespace cpu
#ifdef __aarch64__
-void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
- auto win = window;
+ auto win = window;
const auto window_end_x = window.x().end();
win.set(0, Window::Dimension(0, 1, 1));
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &) {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
#endif // __aarch64__
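
On AArch64 the quantized unary kernels are evaluated through a 256-entry lookup table (lut_u8_neon here, lut_u8_sve2 further down): a QASYMM8 value can only take 256 states, so any unary op collapses to one table lookup per byte. A minimal scalar sketch of the idea — apply_lut_u8 is a hypothetical helper, and the table is assumed to hold the already-requantized result of the op for every possible input byte:

#include <cstddef>
#include <cstdint>

// Map every input byte through a precomputed 256-entry table; the real
// lut_u8_neon / lut_u8_sve2 helpers perform the same lookup in a vectorised way.
void apply_lut_u8(const uint8_t lut[256], const uint8_t *src, uint8_t *dst, size_t len)
{
    for (size_t i = 0; i < len; ++i)
    {
        dst[i] = lut[src[i]];
    }
}
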
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
index d987f7747b..3e4b88eb47 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<uint8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
index e00970a1e0..a5f4b053e3 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_signed_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
index a883309b2e..22ff43c5d9 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,11 +31,12 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float16_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
index b21ed8ddbc..394bd47adf 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,10 +31,11 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index a948862906..5af534d9e7 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -31,9 +32,10 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return svinvsqrt(pg, a);
@@ -55,9 +57,10 @@ inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::val
}
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::NEG:
return svneg_z(pg, a);
@@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = svld1(pg, input_ptr + x);
- svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto vin = svld1(pg, input_ptr + x);
+ svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input, output);
}
template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
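
The SVE loop above is fully predicated: wrapper::svwhilelt builds a predicate covering the elements that remain, and the loop ends once svptest_any reports an all-false predicate, so no scalar tail is needed. A standalone sketch of the same pattern for an FP32 negation, written with raw ACLE intrinsics instead of the library wrappers (negate_f32_sve is an illustrative name, not library code):

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

// Predicated SVE loop: the final, partial vector is handled by the predicate
// itself rather than by a separate leftover loop.
void negate_f32_sve(const float *src, float *dst, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32_s64(x, len);
    do
    {
        const svfloat32_t v = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svneg_f32_z(pg, v));
        x += svcntw();                        // number of 32-bit lanes per vector
        pg = svwhilelt_b32_s64(x, len);
    } while (svptest_any(svptrue_b32(), pg));
}
#endif // __ARM_FEATURE_SVE
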
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
index 068c3f7cda..e27fe5a87f 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -23,16 +23,18 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
index 7e32f50132..4e4582debb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
@@ -23,13 +23,15 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve2_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
@@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
} // namespace cpu
diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h
index 4367e0ffc9..5ac78df324 100644
--- a/src/cpu/kernels/floor/list.h
+++ b/src/cpu/kernels/floor/list.h
@@ -28,8 +28,7 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_FLOOR_KERNEL(func_name) \
- void func_name(const void *src, void *dst, int len)
+#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len)
DECLARE_FLOOR_KERNEL(fp16_neon_floor);
DECLARE_FLOOR_KERNEL(fp32_neon_floor);
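
Each DECLARE_FLOOR_KERNEL(name); line above stamps out the shared prototype, so for example DECLARE_FLOOR_KERNEL(fp32_neon_floor); expands to:

void fp32_neon_floor(const void *src, void *dst, int len);
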
diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp
index f362676a36..f47690277d 100644
--- a/src/cpu/kernels/floor/neon/fp16.cpp
+++ b/src/cpu/kernels/floor/neon/fp16.cpp
@@ -45,14 +45,14 @@ void fp16_neon_floor(const void *src, void *dst, int len)
auto psrc = static_cast<const __fp16 *>(src);
auto pdst = static_cast<__fp16 *>(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
psrc += step;
pdst += step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*pdst = std::floor(*psrc);
++psrc;
diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp
index f5efb2e849..a86e24d3c3 100644
--- a/src/cpu/kernels/floor/neon/fp32.cpp
+++ b/src/cpu/kernels/floor/neon/fp32.cpp
@@ -43,14 +43,14 @@ void fp32_neon_floor(const void *src, void *dst, int len)
auto psrc = static_cast<const float *>(src);
auto pdst = static_cast<float *>(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
psrc += step;
pdst += step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*pdst = std::floor(*psrc);
++pdst;
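
Both floor kernels follow the same shape: a vector main loop over whole registers, then a scalar tail for the leftover elements. A self-contained sketch of the FP32 case with raw NEON intrinsics — floor_f32 is a hypothetical free function and vrndmq_f32 (round toward minus infinity, AArch64) stands in for the library's vfloorq_f32 wrapper:

#if defined(__aarch64__)
#include <arm_neon.h>
#include <cmath>

// Vector main loop plus scalar tail, mirroring fp32_neon_floor above.
void floor_f32(const float *src, float *dst, int len)
{
    constexpr int step = 4;                    // four fp32 lanes per 128-bit register
    for (; len >= step; len -= step)
    {
        vst1q_f32(dst, vrndmq_f32(vld1q_f32(src)));
        src += step;
        dst += step;
    }
    for (; len > 0; --len)                     // leftover elements
    {
        *dst++ = std::floor(*src++);
    }
}
#endif // __aarch64__
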
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
index a29ee762fc..2821af32ce 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
@@ -29,11 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f16(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
index 076e97651d..3ca5b6977a 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
@@ -28,11 +28,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f32(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
index b9017600d6..6fa843263a 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -32,8 +33,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / conv_weights->info()->element_size();
@@ -53,13 +62,20 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *
Iterator conv_w_in(conv_weights, win);
Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
- const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto conv_bias_in =
+ (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto conv_bias_out =
+ (run_in_place_bias ? conv_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,59 +89,61 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *
auto gamma = ScalarType(1.0);
auto beta = ScalarType(0.0);
auto conv_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[3]];
- if(input_gamma != nullptr)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- gamma = input_gamma[id[3]];
- }
+ var = input_var[id[3]];
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id[3]];
+ }
- if((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
- {
- if(input_beta != nullptr)
+ if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
{
- beta = input_beta[id[3]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[3]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Construct vectors
+ mean = input_mean[id[3]];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+
+ if (conv_bias_in != nullptr)
+ {
+ conv_bias_in_scalar = conv_bias_in[id[3]];
+ }
+ auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
}
- // Construct vectors
- mean = input_mean[id[3]];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ int x = window_start_x;
+ auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+ auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- if(conv_bias_in != nullptr)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- conv_bias_in_scalar = conv_bias_in[id[3]];
- }
- auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
- }
-
- int x = window_start_x;
- auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
- auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(conv_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(conv_w_out_ptr + x, wn);
- }
+ // Store results
+ wrapper::vstore(conv_w_out_ptr + x, wn);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- conv_w_in, conv_w_out);
-}
-}
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ conv_w_in, conv_w_out);
}
+} // namespace cpu
+} // namespace arm_compute
#endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
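
The template above folds a batch-normalization layer into the preceding convolution: every weight is scaled by gamma / sqrt(var + epsilon), and the bias becomes (bias - mean) * gamma / sqrt(var + epsilon) + beta. A scalar sketch of that per-channel fold, stripped of the Window/Iterator machinery (fuse_bn_into_conv_channel is an illustrative name, not a library function):

#include <cmath>
#include <cstddef>

// One output channel of the conv + batch-norm fold performed above:
//   w'[i] = w[i] * gamma / sqrt(var + eps)
//   b'    = (b - mean) * gamma / sqrt(var + eps) + beta
void fuse_bn_into_conv_channel(const float *w, float b,
                               float mean, float var, float gamma, float beta, float eps,
                               float *w_fused, float *b_fused, std::size_t n_weights)
{
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (std::size_t i = 0; i < n_weights; ++i)
    {
        w_fused[i] = w[i] * inv_std * gamma;
    }
    *b_fused = (b - mean) * inv_std * gamma + beta;
}

After the fold, the batch-norm layer can be dropped at inference time because its effect is baked into the convolution's weights and bias.
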
diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h
index e25b1e5fed..a03dd74f78 100644
--- a/src/cpu/kernels/fuse_batch_normalization/list.h
+++ b/src/cpu/kernels/fuse_batch_normalization/list.h
@@ -30,15 +30,18 @@ namespace cpu
{
#define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \
void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \
void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \
void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16);
DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32);
@@ -50,7 +53,7 @@ DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_
#undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL
#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL
#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL
-}
-}
+} // namespace cpu
+} // namespace arm_compute
-#endif //
\ No newline at end of file
+#endif //
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
index 1e3be8792d..c0b0dfd4dc 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
@@ -29,8 +29,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / dwc_weights->info()->element_size();
@@ -50,13 +58,20 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso
Iterator dwc_w_in(dwc_weights, win);
Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -70,74 +85,92 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso
auto gamma = ScalarType(1.0);
auto beta = ScalarType(0.0);
auto dwc_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[2]];
- if(input_gamma != nullptr)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- gamma = input_gamma[id[2]];
- }
-
- if(id[1] == 0)
- {
- mean = input_mean[id[2]];
-
- // Construct vectors
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- if(input_beta != nullptr)
+ var = input_var[id[2]];
+ if (input_gamma != nullptr)
{
- beta = input_beta[id[2]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ gamma = input_gamma[id[2]];
}
- if(dwc_bias_in != nullptr)
+ if (id[1] == 0)
{
- dwc_bias_in_scalar = dwc_bias_in[id[2]];
+ mean = input_mean[id[2]];
+
+ // Construct vectors
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[2]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[id[2]];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
}
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
+ int x = window_start_x;
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- int x = window_start_x;
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
}
-void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
index 275211ff38..1d88d3b494 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
@@ -30,11 +30,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
index 67169c5325..1f336bb196 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
@@ -29,11 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
index 6f0386276f..5b74a7aef6 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -32,8 +33,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / dwc_weights->info()->element_size();
@@ -53,13 +62,20 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso
Iterator dwc_w_in(dwc_weights, win);
Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,81 +89,84 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso
auto beta = ScalarType(0.0);
auto dwc_bias_in_scalar = ScalarType(0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- var_vec = wrapper::vloadq(input_var + x);
- if(input_gamma != nullptr)
- {
- gamma_vec = wrapper::vloadq(input_gamma + x);
- }
-
- if((id[2] == 0) && (id[1] == 0))
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- mean_vec = wrapper::vloadq(input_mean + x);
-
- // Construct vectors
- if(input_beta != nullptr)
+ var_vec = wrapper::vloadq(input_var + x);
+ if (input_gamma != nullptr)
{
- beta_vec = wrapper::vloadq(input_beta + x);
+ gamma_vec = wrapper::vloadq(input_gamma + x);
}
- if(dwc_bias_in != nullptr)
+ if ((id[2] == 0) && (id[1] == 0))
{
- dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+ mean_vec = wrapper::vloadq(input_mean + x);
+
+ // Construct vectors
+ if (input_beta != nullptr)
+ {
+ beta_vec = wrapper::vloadq(input_beta + x);
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+ }
+
+ auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec),
+ wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
+ dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
+ wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
}
- auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
- dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
- wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
- }
-
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto var = input_var[x];
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[x];
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
}
- if(id[2] == 0 && id[1] == 0)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- auto mean = input_mean[x];
- if(input_beta != nullptr)
+ auto var = input_var[x];
+ if (input_gamma != nullptr)
{
- beta = input_beta[x];
+ gamma = input_gamma[x];
}
- if(dwc_bias_in != nullptr)
+
+ if (id[2] == 0 && id[1] == 0)
{
- dwc_bias_in_scalar = dwc_bias_in[x];
+ auto mean = input_mean[x];
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[x];
+ }
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[x];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
}
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
-
- const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
}
} // namespace cpu
} // namespace arm_compute
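
The NHWC variant differs from the NCHW kernels above in that channels are the innermost dimension, so mean, var, gamma and beta can be loaded as vectors at each x position instead of being broadcast one scalar per plane. A rough sketch of the vectorised per-channel weight scaling — scale_weights_nhwc_f32 is an illustrative name, and vrsqrteq_f32 is only a fast reciprocal-square-root estimate, used here for brevity:

#include <arm_neon.h>
#include <cmath>

// Scale a run of per-channel weights by gamma / sqrt(var + eps), four channels at a time.
void scale_weights_nhwc_f32(const float *w, const float *var, const float *gamma,
                            float eps, float *w_out, int channels)
{
    const float32x4_t eps_v = vdupq_n_f32(eps);
    int x = 0;
    for (; x <= channels - 4; x += 4)
    {
        const float32x4_t rvar = vrsqrteq_f32(vaddq_f32(vld1q_f32(var + x), eps_v)); // estimate of 1/sqrt
        float32x4_t wn = vmulq_f32(vld1q_f32(w + x), rvar);
        wn = vmulq_f32(wn, vld1q_f32(gamma + x));
        vst1q_f32(w_out + x, wn);
    }
    for (; x < channels; ++x)                                                         // leftover channels
    {
        w_out[x] = w[x] / std::sqrt(var[x] + eps) * gamma[x];
    }
}
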
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
index 505a37174e..4d7507a5da 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
@@ -48,30 +48,32 @@ void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- int x = window_start_x;
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
- const float16x8x2_t c = vld2q_f16(in_ptr + x);
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
- alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- vst2q_f16(out_ptr + x, alpha_ab);
- }
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
+ const float16x8x2_t c = vld2q_f16(in_ptr + x);
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
- }
- },
- in, out);
+ vst2q_f16(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
+ }
+ },
+ in, out);
}
} // namespace
void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
index dd0384ca13..47de0f3928 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -44,33 +45,35 @@ void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- int x = window_start_x;
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
- const float32x4x4_t c = vld4q_f32(in_ptr + x);
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
- alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
- alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
- alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
+ const float32x4x4_t c = vld4q_f32(in_ptr + x);
- vst4q_f32(out_ptr + x, alpha_ab);
- }
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) += *(in_ptr + x) * beta;
- }
- },
- in, out);
+ vst4q_f32(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * beta;
+ }
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
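
The matrix-add kernels implement the beta term of a GEMM in place, dst += beta * src, using a fused multiply-accumulate main loop plus a scalar tail. A compact sketch over one row — matrix_add_row_f32 is an illustrative name, and the real kernel processes 16 floats per iteration:

#include <arm_neon.h>

// dst += beta * src over one contiguous row, four floats at a time.
void matrix_add_row_f32(const float *src, float *dst, int len, float beta)
{
    const float32x4_t beta_v = vdupq_n_f32(beta);
    int x = 0;
    for (; x <= len - 4; x += 4)
    {
        // vmlaq_f32(a, b, c) computes a + b * c lane-wise.
        vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src + x), beta_v));
    }
    for (; x < len; ++x)
    {
        dst[x] += src[x] * beta;
    }
}
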
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
index 8fd79f9287..60fda511e3 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -32,7 +32,8 @@ namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size());
@@ -42,7 +43,8 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const int window_start_x = 32 * info.thread_id;
const int window_step_x = 32 * info.num_threads;
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
- ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x,
+ " (window_end_x - window_start_x) must be multiple of window_step_x");
Window win_out(window);
win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -55,7 +57,7 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -70,169 +72,172 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
-    // Here we don't check for x lower equal than (window_end_x - window_step_x) because
-    // window_end_x is computed above by rounding up, which may cause out-of-bound writes to the dst.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
{
- if(x > width_matrix_b)
+ int x = window_start_x;
+            // Here we don't check for x lower equal than (window_end_x - window_step_x) because
+            // window_end_x is computed above by rounding up, which may cause out-of-bound writes to the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- return;
- }
-
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float16x8_t acc0 = vdupq_n_f16(0.f);
- float16x8_t acc1 = vdupq_n_f16(0.f);
- float16x8_t acc2 = vdupq_n_f16(0.f);
- float16x8_t acc3 = vdupq_n_f16(0.f);
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
-
- float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
-
- matrix_b += 2 * in_b_stride;
-
- b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
-
- vec_a += 4;
- matrix_b += 2 * in_b_stride;
- }
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
- acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
- acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
- acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
-
- matrix_b += in_b_stride;
- }
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f16(acc0, alpha_f16);
- acc1 = vmulq_f16(acc1, alpha_f16);
- acc2 = vmulq_f16(acc2, alpha_f16);
- acc3 = vmulq_f16(acc3, alpha_f16);
- }
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ matrix_b += in_b_stride;
+ }
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f16(acc0, alpha_f16);
+ acc1 = vmulq_f16(acc1, alpha_f16);
+ acc2 = vmulq_f16(acc2, alpha_f16);
+ acc3 = vmulq_f16(acc3, alpha_f16);
+ }
- vst1q_f16(vec_out + 0, acc0);
- vst1q_f16(vec_out + 8, acc1);
- vst1q_f16(vec_out + 16, acc2);
- vst1q_f16(vec_out + 24, acc3);
- }
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
- {
- return;
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
}
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+ for (; x < window_end_x; ++x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float16x4_t vacc = vdup_n_f16(0.f);
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
+ float16x4_t vacc = vdup_n_f16(0.f);
- const float16x4_t b_col =
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
{
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
+ const float16x4_t a0l = vld1_f16(vec_a);
- vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
+ const float16x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
- matrix_b += 4 * in_b_stride;
- }
+ vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
- float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
+ matrix_b += 4 * in_b_stride;
+ }
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16_t b00 = *matrix_b;
+ float16_t acc =
+ vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
- acc += b00 * a0;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16_t b00 = *matrix_b;
- matrix_b += in_b_stride;
- }
+ acc += b00 * a0;
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= static_cast<float16_t>(alpha);
- }
+ matrix_b += in_b_stride;
+ }
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= static_cast<float16_t>(alpha);
+ }
- *(vec_out) = acc;
- }
- },
- ina, inb, out);
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+ *(vec_out) = acc;
+ }
+ },
+ ina, inb, out);
}
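For orientation, the vector path above computes a single output row: each dst element x is the dot product of vec_a with column x of matrix B, scaled by alpha when requested. A minimal scalar sketch of that contract (plain float for portability; the function name and signature are illustrative, not the library's API):

    #include <cstddef>

    // dst[x] = alpha * sum_k vec_a[k] * matrix_b[k * ldb + x]
    // ldb is the row stride of matrix B in elements (in_b_stride above).
    void gemv_row_ref(const float *vec_a, const float *matrix_b, float *dst,
                      std::size_t K, std::size_t N, std::size_t ldb, float alpha)
    {
        for (std::size_t x = 0; x < N; ++x)
        {
            float acc = 0.f;
            for (std::size_t k = 0; k < K; ++k)
            {
                acc += vec_a[k] * matrix_b[k * ldb + x];
            }
            dst[x] = alpha * acc;
        }
    }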
-void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
ARM_COMPUTE_UNUSED(info);
- const int out_width = static_cast<int>(dst->info()->dimension(0));
- const int out_height = static_cast<int>(dst->info()->dimension(1));
- const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
- const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
const int num_elems_matrix_b_x = rhs->info()->dimension(0);
// Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the dst matrix
@@ -243,7 +248,7 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -259,22 +264,16 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
- const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
- auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
- float16x8x4_t c =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f)
- }
- };
+ const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
+ const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
+ auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+ float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}};
- /*
+ /*
This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
|a00 a01 a02 a03 | a04 a05 a06 a07|
|a10 a11 a12 a13 | a14 a15 a16 a17|
@@ -302,111 +301,118 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
The dst tensor's XY-plane must have the shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
*/
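As a reading aid, here is a scalar sketch of the accumulation the loops below perform on the reshaped operands, assuming the layouts described in the comment above (interleaved A stores the 4 rows of a block contiguously for each k; transposed B stores 8 consecutive output columns for each k). Plain float is used for portability and all names are illustrative:

    // One 4x8 output block: a_int[4 * k + r] holds A(block_row + r, k),
    // b_t[8 * k + c] holds B(k, block_col + c).
    void gemm_block_4x8_ref(const float *a_int, const float *b_t, float acc[4][8], int K)
    {
        for (int r = 0; r < 4; ++r)
        {
            for (int c = 0; c < 8; ++c)
            {
                acc[r][c] = 0.f;
            }
        }
        for (int k = 0; k < K; ++k)
        {
            for (int r = 0; r < 4; ++r)
            {
                for (int c = 0; c < 8; ++c)
                {
                    acc[r][c] += a_int[4 * k + r] * b_t[8 * k + c];
                }
            }
        }
    }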
- const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
-
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
-
- {
- const float16x8_t p00 = vld1q_f16(mtx_a0);
- const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
-
- const float16x8_t q00 = vld1q_f16(mtx_b0);
- const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
- const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
- const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+ const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
-
- mtx_a0 += 16;
- mtx_b0 += 32;
- }
+ {
+ const float16x8_t p00 = vld1q_f16(mtx_a0);
+ const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+ const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+ const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+ const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+ mtx_a0 += 16;
+ mtx_b0 += 32;
+ }
- for(; mtx_b0 < mtx_b0_end_addr;)
+ for (; mtx_b0 < mtx_b0_end_addr;)
- {
- const float16x4_t p00 = vld1_f16(mtx_a0);
- const float16x8_t q00 = vld1q_f16(mtx_b0);
+ {
+ const float16x4_t p00 = vld1_f16(mtx_a0);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
- mtx_a0 += 4;
- mtx_b0 += 8;
- }
+ mtx_a0 += 4;
+ mtx_b0 += 8;
+ }
- if(multiply_alpha)
- {
- c.val[0] = vmulq_f16(c.val[0], alpha_f16);
- c.val[1] = vmulq_f16(c.val[1], alpha_f16);
- c.val[2] = vmulq_f16(c.val[2], alpha_f16);
- c.val[3] = vmulq_f16(c.val[3], alpha_f16);
- }
+ if (multiply_alpha)
+ {
+ c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+ c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+ c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+ c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+ }
- if(id.x() < (out_width - 8))
- {
- vst1q_f16(mtx_out, c.val[0]);
- if(id.y() + 1 < out_height)
+ if (id.x() < (out_width - 8))
{
- vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
- if(id.y() + 2 < out_height)
+ vst1q_f16(mtx_out, c.val[0]);
+ if (id.y() + 1 < out_height)
{
- vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
- if(id.y() + 3 < out_height)
+ vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+ if (id.y() + 2 < out_height)
{
- vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ }
}
}
}
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
+ else
{
- *(mtx_out + x) = c.val[0][x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
{
- *(mtx_out + x + 1 * out_stride) = c.val[1][x];
- if(id.y() + 2 < out_height)
+ *(mtx_out + x) = c.val[0][x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out + x + 2 * out_stride) = c.val[2][x];
- if(id.y() + 3 < out_height)
+ *(mtx_out + x + 1 * out_stride) = c.val[1][x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+ *(mtx_out + x + 2 * out_stride) = c.val[2][x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+ }
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp16_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
{
- return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
+ return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
}
-} // namespce cpu
+} // namespace cpu
} // namespace arm_compute
#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
index 9c1f6f3c0f..e12a312280 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp32_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
{
- return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
+ return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
}
-} // namespce cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
index 0051d3d9dc..404d070a37 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
#include "src/core/utils/helpers/float_ops.h"
#include <arm_neon.h>
@@ -31,10 +32,12 @@ namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
- const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
+ const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride =
+ static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));
// The implementation computes 16 elements per iteration
@@ -54,7 +57,7 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -69,209 +72,220 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
- // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
- // window_end_x is computed above which may cause out-of-bound writes to the dst.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
{
- if(x > width_matrix_b)
+ int x = window_start_x;
+            // Here we use x < (window_end_x - window_step_x) rather than <=, because window_end_x,
+            // computed above, may otherwise cause out-of-bound writes to the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- return;
- }
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float32x4_t acc0 = vdupq_n_f32(0.f);
- float32x4_t acc1 = vdupq_n_f32(0.f);
- float32x4_t acc2 = vdupq_n_f32(0.f);
- float32x4_t acc3 = vdupq_n_f32(0.f);
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */
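The PLD blocks here and below issue AArch32-only prefetch hints through inline assembly; on other targets the #if excludes them entirely. A roughly equivalent, compiler-portable hint (GCC/Clang builtin, shown only for illustration) would be:

    // Read prefetch with high temporal locality; purely a hint, valid for any address.
    static inline void prefetch_read(const void *addr)
    {
        __builtin_prefetch(addr, 0 /* read */, 3 /* high locality */);
    }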
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- float32x2_t a0l = vld1_f32(vec_a);
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
- float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
- a0l = vld1_f32(vec_a);
+ a0l = vld1_f32(vec_a);
- b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
- }
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
- const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- acc0 = vmlaq_n_f32(acc0, b00, a0);
- acc1 = vmlaq_n_f32(acc1, b01, a0);
- acc2 = vmlaq_n_f32(acc2, b02, a0);
- acc3 = vmlaq_n_f32(acc3, b03, a0);
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
- matrix_b += in_b_stride;
- }
+ matrix_b += in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f32(acc0, alpha_f32);
- acc1 = vmulq_f32(acc1, alpha_f32);
- acc2 = vmulq_f32(acc2, alpha_f32);
- acc3 = vmulq_f32(acc3, alpha_f32);
- }
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f32(acc0, alpha_f32);
+ acc1 = vmulq_f32(acc1, alpha_f32);
+ acc2 = vmulq_f32(acc2, alpha_f32);
+ acc3 = vmulq_f32(acc3, alpha_f32);
+ }
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
- vst1q_f32(vec_out + 0, acc0);
- vst1q_f32(vec_out + 4, acc1);
- vst1q_f32(vec_out + 8, acc2);
- vst1q_f32(vec_out + 12, acc3);
- }
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ }
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
+ // Left-over loop
+ for (; x < window_end_x; ++x)
{
- return;
- }
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float32x4_t vacc = vdupq_n_f32(0.f);
+ float32x4_t vacc = vdupq_n_f32(0.f);
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float32x4_t a0l = vld1q_f32(vec_a);
-
- const float32x4_t b_col =
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
{
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
+ const float32x4_t a0l = vld1q_f32(vec_a);
+
+ const float32x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */
- vacc = vmlaq_f32(vacc, b_col, a0l);
+ vacc = vmlaq_f32(vacc, b_col, a0l);
- matrix_b += 4 * in_b_stride;
- }
+ matrix_b += 4 * in_b_stride;
+ }
- float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);
+ float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) +
+ vgetq_lane_f32(vacc, 3);
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
- const float b00 = *matrix_b;
+ const float b00 = *matrix_b;
- acc += b00 * a0;
+ acc += b00 * a0;
- matrix_b += in_b_stride;
- }
+ matrix_b += in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= alpha;
- }
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= alpha;
+ }
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
- *vec_out = acc;
- }
- },
- ina, inb, out);
+ *vec_out = acc;
+ }
+ },
+ ina, inb, out);
}
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
ARM_COMPUTE_UNUSED(info);
- const int out_width = static_cast<int>(dst->info()->dimension(0));
- const int out_height = static_cast<int>(dst->info()->dimension(1));
- const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
- const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
- const size_t out_stride2 = out_stride1 * 2;
- const size_t out_stride3 = out_stride1 * 3;
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = rhs->info()->dimension(0);
// Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the dst matrix
@@ -282,7 +296,7 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -302,338 +316,340 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
- auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
- auto mtx_b1 = mtx_b0 + in_b_stride;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
- float32x4_t acc00 = vdupq_n_f32(0.f);
- float32x4_t acc10 = vdupq_n_f32(0.f);
- float32x4_t acc20 = vdupq_n_f32(0.f);
- float32x4_t acc30 = vdupq_n_f32(0.f);
+ float32x4_t acc00 = vdupq_n_f32(0.f);
+ float32x4_t acc10 = vdupq_n_f32(0.f);
+ float32x4_t acc20 = vdupq_n_f32(0.f);
+ float32x4_t acc30 = vdupq_n_f32(0.f);
- float32x4_t acc01 = vdupq_n_f32(0.f);
- float32x4_t acc11 = vdupq_n_f32(0.f);
- float32x4_t acc21 = vdupq_n_f32(0.f);
- float32x4_t acc31 = vdupq_n_f32(0.f);
+ float32x4_t acc01 = vdupq_n_f32(0.f);
+ float32x4_t acc11 = vdupq_n_f32(0.f);
+ float32x4_t acc21 = vdupq_n_f32(0.f);
+ float32x4_t acc31 = vdupq_n_f32(0.f);
#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
- float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
- float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
- float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
- float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
- float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
-
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
- }
-
- for(; mtx_b0 < mtx_b0_end_addr;)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for (; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
#if __arm__
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- mtx_a0 += 4;
- mtx_b0 += 4;
- mtx_b1 += 4;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc00 = vmulq_f32(acc00, alpha_f32);
- acc10 = vmulq_f32(acc10, alpha_f32);
- acc20 = vmulq_f32(acc20, alpha_f32);
- acc30 = vmulq_f32(acc30, alpha_f32);
- acc01 = vmulq_f32(acc01, alpha_f32);
- acc11 = vmulq_f32(acc11, alpha_f32);
- acc21 = vmulq_f32(acc21, alpha_f32);
- acc31 = vmulq_f32(acc31, alpha_f32);
- }
-
- const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
- const auto mtx_out1 = mtx_out0 + 4;
-
- if(id.x() < (out_width - 8))
- {
- vst1q_f32(mtx_out0, acc00);
- vst1q_f32(mtx_out1, acc01);
- if(id.y() + 1 < out_height)
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc00 = vmulq_f32(acc00, alpha_f32);
+ acc10 = vmulq_f32(acc10, alpha_f32);
+ acc20 = vmulq_f32(acc20, alpha_f32);
+ acc30 = vmulq_f32(acc30, alpha_f32);
+ acc01 = vmulq_f32(acc01, alpha_f32);
+ acc11 = vmulq_f32(acc11, alpha_f32);
+ acc21 = vmulq_f32(acc21, alpha_f32);
+ acc31 = vmulq_f32(acc31, alpha_f32);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+ const auto mtx_out1 = mtx_out0 + 4;
+
+ if (id.x() < (out_width - 8))
{
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- vst1q_f32(mtx_out1 + out_stride1, acc11);
- if(id.y() + 2 < out_height)
+ vst1q_f32(mtx_out0, acc00);
+ vst1q_f32(mtx_out1, acc01);
+ if (id.y() + 1 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- vst1q_f32(mtx_out1 + out_stride2, acc21);
- if(id.y() + 3 < out_height)
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ vst1q_f32(mtx_out1 + out_stride1, acc11);
+ if (id.y() + 2 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride3, acc30);
- vst1q_f32(mtx_out1 + out_stride3, acc31);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ vst1q_f32(mtx_out1 + out_stride2, acc21);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out1 + out_stride3, acc31);
+ }
}
}
}
- }
- else if(id.x() < (out_width - 4))
- {
- vst1q_f32(mtx_out0, acc00);
- if(id.y() + 1 < out_height)
+ else if (id.x() < (out_width - 4))
{
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- if(id.y() + 2 < out_height)
+ vst1q_f32(mtx_out0, acc00);
+ if (id.y() + 1 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- if(id.y() + 3 < out_height)
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ if (id.y() + 2 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ }
}
}
- }
- // Left-over columns
- const int columns_left = out_width - id.x() - 4;
- for(auto x = 0; x < columns_left; ++x)
- {
- *(mtx_out1 + x) = acc01[x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x() - 4;
+ for (auto x = 0; x < columns_left; ++x)
{
- *(mtx_out1 + x + out_stride1) = acc11[x];
- if(id.y() + 2 < out_height)
+ *(mtx_out1 + x) = acc01[x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out1 + x + out_stride2) = acc21[x];
- if(id.y() + 3 < out_height)
+ *(mtx_out1 + x + out_stride1) = acc11[x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out1 + x + out_stride3) = acc31[x];
+ *(mtx_out1 + x + out_stride2) = acc21[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out1 + x + out_stride3) = acc31[x];
+ }
}
}
}
}
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
+ else
{
- *(mtx_out0 + x) = acc00[x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
{
- *(mtx_out0 + x + out_stride1) = acc10[x];
- if(id.y() + 2 < out_height)
+ *(mtx_out0 + x) = acc00[x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out0 + x + out_stride2) = acc20[x];
- if(id.y() + 3 < out_height)
+ *(mtx_out0 + x + out_stride1) = acc10[x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out0 + x + out_stride3) = acc30[x];
+ *(mtx_out0 + x + out_stride2) = acc20[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out0 + x + out_stride3) = acc30[x];
+ }
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
} // namespace cpu
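Throughout the f32 kernel above, the nested id.x()/id.y() checks select between storing a full 8-wide tile, a 4-wide tile plus a scalar tail, or a fully scalar tail, so rows and columns beyond the dst bounds are never written. A minimal sketch of that tail store (hypothetical helper, not part of the library):

    // Store only the first n lanes of a 4-element accumulator, as the
    // left-over column loops above do element by element.
    static inline void store_partial(float *dst, const float (&acc)[4], int n)
    {
        for (int i = 0; i < n && i < 4; ++i)
        {
            dst[i] = acc[i];
        }
    }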
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
index f9f1f247ac..74ea4c2b17 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
@@ -24,15 +24,18 @@
#ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
#define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/CPP/Validate.h"
namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h
index 9cdb58ae06..15b23b1d81 100644
--- a/src/cpu/kernels/gemm_matrix_mul/list.h
+++ b/src/cpu/kernels/gemm_matrix_mul/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
- void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
+ void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \
+ float alpha, const bool is_dst_vector)
DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul);
DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul);
#undef DECLARE_GEMMMATRIXMUL_KERNEL
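For reference, with the reflowed macro above, DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); expands to the single declaration:

    void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window,
                                   const ThreadInfo &info, float alpha, const bool is_dst_vector);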
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
index d4e469b691..4ed7e54f1c 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
@@ -27,10 +27,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
index 09aa6ecec4..f15cd63bb2 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
@@ -26,9 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp32_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
index 9224e32a94..8cb76f3afb 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -28,7 +28,10 @@ class ITensor;
class Window;
namespace cpu
{
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
Iterator all_anchors_it(all_anchors, window);
Iterator anchors_it(all_anchors, window);
@@ -39,28 +42,30 @@ void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors,
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
- const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+ const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
- const size_t shift_idy = id.y() / num_anchors;
- const float shiftx = (shift_idy % feat_width) * stride;
- const float shifty = (shift_idy / feat_width) * stride;
+ const size_t shift_idy = id.y() / num_anchors;
+ const float shiftx = (shift_idy % feat_width) * stride;
+ const float shifty = (shift_idy / feat_width) * stride;
- const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
- const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
- const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
- const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
+ const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
+ const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
- *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
- *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
- *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
- *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
- },
- all_anchors_it);
+ *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
+ *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
+ *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
+ *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
+ },
+ all_anchors_it);
}
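Each quantized anchor coordinate above is dequantized, shifted in real units, and requantized with the same scale. A minimal sketch of that round trip, assuming the usual symmetric 16-bit scheme value ≈ q * scale (the exact rounding and saturation are whatever quantize_qsymm16 implements in the library):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int16_t shift_qsymm16(int16_t q, float scale, float shift)
    {
        const float real    = static_cast<float>(q) * scale + shift;             // dequantize, then shift
        const float requant = std::round(real / scale);                          // back onto the quantized grid
        return static_cast<int16_t>(std::clamp(requant, -32768.f, 32767.f));     // saturate to int16
    }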
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
index da052c9192..3317bcfbe6 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.h
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -26,13 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
Iterator all_anchors_it(all_anchors, window);
Iterator anchors_it(all_anchors, window);
@@ -41,26 +45,31 @@ void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAn
const T stride = 1.f / anchors_info.spatial_scale();
const size_t feat_width = anchors_info.feat_width();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
- const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+ const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
- const size_t shift_idy = id.y() / num_anchors;
- const T shiftx = (shift_idy % feat_width) * stride;
- const T shifty = (shift_idy / feat_width) * stride;
+ const size_t shift_idy = id.y() / num_anchors;
+ const T shiftx = (shift_idy % feat_width) * stride;
+ const T shifty = (shift_idy / feat_width) * stride;
- *out_anchor_ptr = *anchor_ptr + shiftx;
- *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
- *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
- *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
- },
- all_anchors_it);
+ *out_anchor_ptr = *anchor_ptr + shiftx;
+ *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+ *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+ *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+ },
+ all_anchors_it);
}
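The indexing above enumerates the anchors of the feature map row-major: for output row y, the base anchor is y % num_anchors and the spatial shift is derived from y / num_anchors. A small self-contained sketch with a worked example (names are illustrative):

    #include <cstddef>

    struct AnchorShift
    {
        std::size_t base_anchor;
        float       shift_x;
        float       shift_y;
    };

    AnchorShift anchor_shift_for_row(std::size_t y, std::size_t num_anchors, std::size_t feat_width, float spatial_scale)
    {
        const float       stride    = 1.f / spatial_scale;
        const std::size_t shift_idy = y / num_anchors;
        return {y % num_anchors, (shift_idy % feat_width) * stride, (shift_idy / feat_width) * stride};
    }

    // e.g. y = 7, num_anchors = 3, feat_width = 4, spatial_scale = 1/16:
    // base_anchor = 1, shift_x = 2 * 16 = 32, shift_y = 0 * 16 = 0.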
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
index cfb5a41d6e..7182d0b27d 100644
--- a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
@@ -26,9 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_qu16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
index 2b7d91b144..44418c0bb9 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
@@ -40,7 +41,10 @@ void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputT
}
template <typename InputType, typename AccType>
-InputType vector_float_norm_fp16(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType vector_float_norm_fp16(const InputType &inputs,
+ const AccType &vec_mean,
+ const AccType &vec_multip,
+ const AccType &vec_beta)
{
return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
}
@@ -52,19 +56,24 @@ inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_squar
vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
}
template <>
-inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs,
+ const float32x4_t &vec_mean,
+ const float32x4_t &vec_multip,
+ const float32x4_t &vec_beta)
{
- const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
- const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
- const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
- const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
- float16x8_t result = wrapper::vcombine(result_low, result_high);
+ const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+ const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+ const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
+ const auto result_high =
+ wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
+ float16x8_t result = wrapper::vcombine(result_low, result_high);
return result;
}
template <typename AccType>
-void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw_fp16(
+ const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -78,91 +87,105 @@ void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, flo
const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
- }
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec =
+ vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
},
- input_plane_it, output_plane_it);
- },
- input_it);
-}
+ input_it);
}
-
-void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+} // namespace
+
+void neon_fp16_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
{
- if(use_mixed_precision)
+ if (use_mixed_precision)
{
return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window);
}
 
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
index 061dd9585c..e1ca05518d 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -26,7 +26,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+void neon_fp32_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(use_mixed_precision);
return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window);
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
index 483b6f568b..515079e1b5 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -38,13 +39,15 @@ void vector_float_sum(AccType &result, AccType &result_square, const InputType &
}
template <typename InputType, typename AccType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
{
return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
}
template <typename T, typename AccType>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -58,88 +61,96 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, f
const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
- }
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
},
- input_plane_it, output_plane_it);
- },
- input_it);
+ input_it);
}
-template void instance_normalization_nchw<float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+template void instance_normalization_nchw<float>(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
} // namespace cpu
} // namespace arm_compute
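Note: instance_normalization_nchw above normalizes every (channel, batch) plane independently over its H x W elements. A scalar reference of the same math, assuming a densely packed NCHW buffer; function and parameter names are illustrative and not arm_compute API.

#include <cmath>
#include <cstddef>

// For every (channel, batch) plane of size width * height:
//   mean   = sum(x) / N
//   var    = sum(x^2) / N - mean^2
//   output = (x - mean) * gamma / sqrt(var + epsilon) + beta
void instance_norm_nchw_ref(const float *src, float *dst,
                            std::size_t width, std::size_t height,
                            std::size_t channels, std::size_t batches,
                            float gamma, float beta, float epsilon)
{
    const std::size_t plane = width * height;
    for (std::size_t n = 0; n < batches; ++n)
    {
        for (std::size_t c = 0; c < channels; ++c)
        {
            const float *in  = src + (n * channels + c) * plane;
            float       *out = dst + (n * channels + c) * plane;

            float sum = 0.f, sum_sq = 0.f;
            for (std::size_t i = 0; i < plane; ++i)
            {
                sum    += in[i];
                sum_sq += in[i] * in[i];
            }
            const float mean   = sum / plane;
            const float var    = sum_sq / plane - mean * mean;
            const float multip = gamma / std::sqrt(var + epsilon);

            for (std::size_t i = 0; i < plane; ++i)
            {
                out[i] = (in[i] - mean) * multip + beta;
            }
        }
    }
}

The vectorized kernel performs the same two passes per plane (sum/sum-of-squares, then normalize), with the fp16 variant optionally accumulating in float.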
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h
index 0ddfcdd5ba..e1cc7487f7 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.h
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h
@@ -32,13 +32,15 @@ namespace arm_compute
namespace cpu
{
template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
template <typename InputType, typename AccType = InputType>
void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs);
template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h
index 54f1d3213f..51b496c41d 100644
--- a/src/cpu/kernels/instancenorm/list.h
+++ b/src/cpu/kernels/instancenorm/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_INSTANCENORM_KERNEL(func_name) \
- void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+#define DECLARE_INSTANCENORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \
+ const Window &window)
DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm);
DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm);
#undef DECLARE_INSTANCENORM_KERNEL
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
index b503a8b734..32d9ca4eac 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -24,18 +24,17 @@
#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/AssemblyUtils.h"
-
#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "src/core/utils/AssemblyUtils.h"
#include "depthwise_common.hpp"
-
#include <arm_neon.h>
namespace arm_compute
@@ -54,9 +53,13 @@ constexpr unsigned int idx_channels = 0;
constexpr unsigned int idx_batches = 3;
template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info,
- std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
+void create_arm_dwc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+ std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -79,13 +82,13 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI
const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
- arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
- n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
- padding, activation, nullptr);
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
// Configure assembly pooling kernel
auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
- if(dwc_kernel_asm == nullptr)
+ if (dwc_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -96,11 +99,16 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI
}
template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info,
+void create_arm_dwc_quant(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
- std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
- std::string &_name)
+ std::vector<int32_t> &multipliers,
+ std::vector<int32_t> &right_shifts,
+ std::vector<int32_t> &left_shifts,
+ std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -123,9 +131,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
- arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
- n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
- padding, activation, nullptr);
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
const auto src_qinfo = src->quantization_info().uniform();
const auto weights_qinfo = weights->quantization_info();
@@ -135,64 +143,50 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
multipliers.resize(num_filters);
std::vector<int32_t> dst_shifts(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(src,
- weights,
- dst,
- multipliers.data(),
- dst_shifts.data());
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data());
// Quantize activation bounds
int32_t min_activation = std::numeric_limits<TSrc>::lowest();
int32_t max_activation = std::numeric_limits<TSrc>::max();
- if(info.act_info.enabled())
+ if (info.act_info.enabled())
{
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
}
// Set quantization parameters for assembly kernels
arm_gemm::Requantize32 requant_args{};
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
left_shifts.resize(num_filters);
right_shifts.resize(num_filters);
bool need_left_shift = false; // Select more optimized path if left shift is not needed
- for(unsigned int i = 0; i < num_filters; ++i)
+ for (unsigned int i = 0; i < num_filters; ++i)
{
left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
- if(dst_shifts[i] < 0 && !need_left_shift)
+ if (dst_shifts[i] < 0 && !need_left_shift)
{
need_left_shift = true;
}
}
- requant_args = arm_gemm::Requantize32(nullptr,
- 0,
- src_qinfo.offset,
- weights_qinfo.uniform().offset,
- dst_qinfo.offset,
- (need_left_shift) ? left_shifts.data() : nullptr,
- right_shifts.data(),
- multipliers.data(),
- static_cast<TSrc>(min_activation),
- static_cast<TSrc>(max_activation));
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr,
+ right_shifts.data(), multipliers.data(),
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
}
else
{
- requant_args = arm_gemm::Requantize32(nullptr,
- 0,
- src_qinfo.offset,
- weights_qinfo.uniform().offset,
- dst_qinfo.offset,
- -dst_shifts[0],
- multipliers[0],
- static_cast<TSrc>(min_activation),
- static_cast<TSrc>(max_activation));
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, -dst_shifts[0], multipliers[0],
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
}
// Configure assembly pooling kernel with requantization
- auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
- if(dwc_kernel_asm == nullptr)
+ auto dwc_kernel_asm =
+ arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
+ if (dwc_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -203,18 +197,18 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
} // namespace
CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
- : _kernel_asm(nullptr),
- _multipliers(),
- _left_shifts(),
- _right_shifts(),
- _name()
+ : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name()
{
}
CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
-void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info)
{
ARM_COMPUTE_UNUSED(cpu_info);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -225,24 +219,30 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
_name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
std::string asm_kernel_name("");
#if defined(__aarch64__)
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
- create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
}
else
{
- create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
}
break;
case DataType::QASYMM8_SIGNED:
- create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers,
+ _right_shifts, _left_shifts, asm_kernel_name);
break;
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F16:
- create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
+ create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ asm_kernel_name);
break;
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F32:
@@ -255,13 +255,17 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
- if(_kernel_asm != nullptr)
+ if (_kernel_asm != nullptr)
{
_name += "/" + asm_kernel_name;
}
}
-Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -269,10 +273,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
#endif // !defined(__aarch64__)
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC,
+ "Only NHWC is supported by assembly kernels");
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -282,12 +288,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -297,7 +303,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
}
}
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -305,17 +311,15 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
}
// Assembly kernels cannot work with padding greater than the kernel.
- const auto &padding = info.pad_stride_info;
- const auto &dilation = info.dilation;
+ const auto &padding = info.pad_stride_info;
+ const auto &dilation = info.dilation;
const auto &wei_shape = weights->tensor_shape();
const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1);
const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1);
- ARM_COMPUTE_RETURN_ERROR_ON(
- padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
- padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h
- );
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
+ padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h);
return Status{};
}
@@ -351,13 +355,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const
const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
- _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
- parameters_ptr,
- dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
- working_space, info.thread_id, info.num_threads);
+ _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row,
+ ld_dst_batch, working_space, info.thread_id, info.num_threads);
}
-void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(
+ void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
{
_kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
}
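Note: for per-channel quantized weights, the code above splits each dst_shift into a left and a right shift before building the Requantize32 arguments. A standalone sketch of just that splitting step; the struct and function names are illustrative, and the multipliers and dst_shifts themselves come from compute_quantized_multipliers_and_shifts.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct SplitShifts
{
    std::vector<std::int32_t> left;
    std::vector<std::int32_t> right;
    bool                      need_left_shift = false; // the faster path is taken when no left shift is needed
};

SplitShifts split_requant_shifts(const std::vector<std::int32_t> &dst_shifts)
{
    SplitShifts out;
    out.left.resize(dst_shifts.size());
    out.right.resize(dst_shifts.size());
    for (std::size_t i = 0; i < dst_shifts.size(); ++i)
    {
        out.left[i]  = std::max<std::int32_t>(-dst_shifts[i], 0); // positive part becomes a left shift
        out.right[i] = std::min<std::int32_t>(-dst_shifts[i], 0); // non-positive part stays a right shift
        out.need_left_shift = out.need_left_shift || (dst_shifts[i] < 0);
    }
    return out;
}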
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
index f61cb1b09c..fadaefb999 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
@@ -35,8 +36,8 @@ namespace depthwise
{
// Forward declarations
class IDepthwiseCommon;
-} // depthwise
-} // arm_conv
+} // namespace depthwise
+} // namespace arm_conv
namespace arm_compute
{
@@ -66,7 +67,12 @@ public:
* @param[in] info Depthwise convolution layer meta-data.
* @param[in] cpu_info CPU information needed to select the most appropriate kernel.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -74,10 +80,14 @@ public:
*
* @return a status.
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Pack bias and weights in a storage space for the assembly kernel
@@ -88,7 +98,8 @@ public:
* @param[in] ld_weights_col Columns displacement for the weights tensor.
* @param[in] ld_weights_row Rows displacement for the weights tensor.
*/
- void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+ void pack_parameters(
+ void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
/** Get the amount of storage space required for the rearranged weights and bias.
*
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index 10ff4183c0..a161c800fd 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -22,14 +22,16 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -41,7 +43,10 @@ namespace kernels
{
using namespace arm_compute::misc::shape_calculator;
-void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
ARM_COMPUTE_UNUSED(cpu_info);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -52,10 +57,10 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
#if defined(__aarch64__)
const bool requantize = src->quantization_info() != dst->quantization_info();
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
- if(requantize)
+ if (requantize)
{
create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
}
@@ -65,7 +70,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
}
break;
case DataType::QASYMM8_SIGNED:
- if(requantize)
+ if (requantize)
{
create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
}
@@ -91,7 +96,8 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
INEKernel::configure(win);
}
-Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status
+CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -99,43 +105,52 @@ Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT
ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
#endif /* __aarch64__ */
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC),
+ "Only NHWC is supported by assembly kernels");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
"Only AVG and MAX pooling are supported by assembly kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(info),
+ "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
const auto src_qinfo = src->quantization_info().uniform();
const auto dst_qinfo = dst->quantization_info().uniform();
- if(src_qinfo != dst_qinfo)
+ if (src_qinfo != dst_qinfo)
{
const float multiplier = src_qinfo.scale / dst_qinfo.scale;
int32_t dst_multiplier{};
int32_t dst_shift{};
- ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
}
else
{
- if(src->data_type() == DataType::QASYMM8)
+ if (src->data_type() == DataType::QASYMM8)
{
const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
}
}
}
else
{
- if(src->data_type() == DataType::QASYMM8)
+ if (src->data_type() == DataType::QASYMM8)
{
            // If dst is not configured, the quantization info is the same
const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
}
}
return Status{};
@@ -154,9 +169,10 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
- const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
- auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
- auto working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+ const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ auto working_space =
+ (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
const auto src_shape = src->info()->tensor_shape();
const auto dst_shape = dst->info()->tensor_shape();
@@ -170,8 +186,7 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &
const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
- _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
- out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+ _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
working_space, info.thread_id, info.num_threads);
}
@@ -186,9 +201,14 @@ bool CpuPool2dAssemblyWrapperKernel::is_configured() const
}
template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
arm_conv::pooling::PoolingWindow window{};
window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -197,7 +217,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
arm_conv::pooling::PoolingStride stride{};
std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
constexpr unsigned int idx_width = 1;
constexpr unsigned int idx_height = 2;
@@ -211,11 +232,12 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
const unsigned int dst_rows = dst->dimension(idx_height);
const unsigned int dst_cols = dst->dimension(idx_width);
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
// Configure assembly pooling kernel
auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
- if(pooling_kernel_asm == nullptr)
+ if (pooling_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -225,9 +247,14 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
}
template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
arm_conv::pooling::PoolingWindow window{};
window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -236,7 +263,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
arm_conv::pooling::PoolingStride stride{};
std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
constexpr unsigned int idx_width = 1;
constexpr unsigned int idx_height = 2;
@@ -250,7 +278,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
const unsigned int dst_rows = dst->dimension(idx_height);
const unsigned int dst_cols = dst->dimension(idx_width);
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
const auto src_qinfo = src->quantization_info().uniform();
const auto dst_qinfo = dst->quantization_info().uniform();
@@ -260,15 +289,15 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
int32_t dst_shift{};
quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
- const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
- dst_qinfo.offset,
+ const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset,
dst_shift, // left shift
0, // right shift
dst_multiplier);
// Configure assembly pooling kernel with requantization
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
- if(pooling_kernel_asm == nullptr)
+ auto pooling_kernel_asm =
+ arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+ if (pooling_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
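Note: when src and dst quantization infos differ, the kernel derives a fixed-point multiplier and shift for the scale ratio src_scale / dst_scale. The sketch below shows one common way such a Q0.31 multiplier is computed; it is illustrative only, and the sign and rounding conventions of the library's quantization::calculate_quantized_multiplier may differ.

#include <cmath>
#include <cstdint>

// Express real_multiplier as multiplier * 2^(-31) * 2^(shift),
// with multiplier a Q0.31 fixed-point value.
void quantize_multiplier_sketch(double real_multiplier, std::int32_t &quant_multiplier, int &shift)
{
    if (real_multiplier == 0.)
    {
        quant_multiplier = 0;
        shift            = 0;
        return;
    }
    int          exponent = 0;
    const double q        = std::frexp(real_multiplier, &exponent); // real = q * 2^exponent, q in [0.5, 1)
    std::int64_t q_fixed  = static_cast<std::int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalize
    {
        q_fixed /= 2;
        ++exponent;
    }
    quant_multiplier = static_cast<std::int32_t>(q_fixed);
    shift            = exponent; // positive means left shift, negative means right shift
}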
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 8713d5c54d..b4ff1e6f2d 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -25,8 +25,9 @@
#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+
#include "src/core/common/Macros.h"
+#include "src/core/NEON/kernels/assembly/pooling.hpp"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
@@ -101,7 +102,8 @@ private:
* @param[in] info Pooling layer meta-data.
*/
template <typename Typesrc, typename Typedst>
- void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ void
+ create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
/** Helper function to create the assembly kernel with requantization support
*
@@ -110,9 +112,12 @@ private:
* @param[in] info Pooling layer meta-data.
*/
template <typename Typesrc, typename Typedst>
- void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ void create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info);
- std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
+ std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr};
/** Return minimum workload size of the relevant kernel
*
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
index 661c3d7f46..6c6527de06 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
@@ -32,13 +32,15 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp16_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
}
-void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp16_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
index be32bdc4fa..520877068c 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
@@ -22,21 +22,23 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp32_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float, 4>(in, sum, out, epsilon, window);
}
-void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp32_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
index a06cdd33d3..6bd19299b7 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <cstddef>
@@ -51,33 +52,36 @@ void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float e
Iterator sum_it(sum, win_collapsed);
Iterator output_it(out, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
- const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+ const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
@@ -97,28 +101,30 @@ void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float
const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
} // namespace cpu
} // namespace arm_compute
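The reformatted lambda above leaves the arithmetic untouched: along X, every output element is the input scaled by the reciprocal square root of the clamped sum of squares. A minimal scalar sketch of that computation, with plain arrays standing in for the ITensor/Window plumbing (function and parameter names below are illustrative, not library API):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Scalar reference of l2_normalize_x: 'sum' is the precomputed sum of squares
    // of the row and 'epsilon' guards against division by zero.
    void l2_normalize_x_ref(const float *in, float sum, float *out, std::size_t len, float epsilon)
    {
        const float norm = 1.0f / std::sqrt(std::max(sum, epsilon));
        for (std::size_t x = 0; x < len; ++x)
        {
            out[x] = in[x] * norm;
        }
    }

The YZ variant differs only in that the sum tensor provides one sum-of-squares value per element position rather than one per row, which is why the vectorised path reloads and inverts it inside the inner loop.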
diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h
index 2bad7f54f5..e2a879d06e 100644
--- a/src/cpu/kernels/l2normlayer/list.h
+++ b/src/cpu/kernels/l2normlayer/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \
+ size_t axis)
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x);
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz);
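The backslash realignment does not change what the macro produces; written out by hand, the first declaration above expands to:

    void neon_fp16_l2_normalize_x(
        const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);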
diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp
index 8ab647bfee..5516f5b33d 100644
--- a/src/cpu/kernels/lut/generic/neon/u8.cpp
+++ b/src/cpu/kernels/lut/generic/neon/u8.cpp
@@ -32,376 +32,374 @@ namespace cpu
#ifdef __aarch64__
void lut_u8_neon(
- const uint8_t *table,
- size_t num_strings,
- size_t string_length,
- const uint8_t *const *input,
- uint8_t *const *output)
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
{
- __asm__ __volatile__(
- "ldr q16, [%x[table], #0x0]\n"
- "ldr q17, [%x[table], #0x10]\n"
- "mov x23, #0x0\n"
- "ldr q18, [%x[table], #0x20]\n"
- "ldr q19, [%x[table], #0x30]\n"
- "ldr q20, [%x[table], #0x40]\n"
- "ldr q21, [%x[table], #0x50]\n"
- "ldr q22, [%x[table], #0x60]\n"
- "ldr q23, [%x[table], #0x70]\n"
- "ldr q24, [%x[table], #0x80]\n"
- "ldr q25, [%x[table], #0x90]\n"
- "ldr q26, [%x[table], #0xa0]\n"
- "ldr q27, [%x[table], #0xb0]\n"
- "ldr q28, [%x[table], #0xc0]\n"
- "ldr q29, [%x[table], #0xd0]\n"
- "ldr q30, [%x[table], #0xe0]\n"
- "ldr q31, [%x[table], #0xf0]\n"
- "1:" // string loop
- "ldr x22, [%x[input], x23, LSL #0x3]\n"
- "ldr x21, [%x[output], x23, LSL #0x3]\n"
- "movi v11.16b, #0x40\n"
- "movi v10.16b, #0x80\n"
- "movi v9.16b, #0xc0\n"
- "mov x20, %x[string_length]\n"
- "2:" // 4 rounds: width loop
- "cmp x20, #0x30\n"
- "bge 27f\n"
- "tbz x20, #5, 10f\n"
- "ld1 { v8.16b }, [x22], #0x10\n"
- "ld1 { v13.16b }, [x22], #0x10\n"
- "tbz x20, #3, 6f\n"
- "ldr d12, [x22], #0x8\n"
- "tbz x20, #2, 4f\n"
- "ld1 { v12.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 3f\n"
- "ld1 { v12.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[14], [x22]\n"
- "b 26f\n"
- "3:" // 4 rounds: Partial load: partial_1_44
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[12], [x22]\n"
- "b 26f\n"
- "4:" // 4 rounds: Partial load: partial_2_40
- "tbz x20, #1, 5f\n"
- "ld1 { v12.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[10], [x22]\n"
- "b 26f\n"
- "5:" // 4 rounds: Partial load: partial_1_40
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[8], [x22]\n"
- "b 26f\n"
- "6:" // 4 rounds: Partial load: partial_4_32
- "tbz x20, #2, 8f\n"
- "ldr s12, [x22], #0x4\n"
- "tbz x20, #1, 7f\n"
- "ld1 { v12.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[6], [x22]\n"
- "b 26f\n"
- "7:" // 4 rounds: Partial load: partial_1_36
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[4], [x22]\n"
- "b 26f\n"
- "8:" // 4 rounds: Partial load: partial_2_32
- "tbz x20, #1, 9f\n"
- "ldr h12, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[2], [x22]\n"
- "b 26f\n"
- "9:" // 4 rounds: Partial load: partial_1_32
- "tbz x20, #0, 26f\n"
- "ldr b12, [x22, #0x0]\n"
- "b 26f\n"
- "10:" // 4 rounds: Partial load: partial_16_0
- "tbz x20, #4, 18f\n"
- "ld1 { v8.16b }, [x22], #0x10\n"
- "tbz x20, #3, 14f\n"
- "ldr d13, [x22], #0x8\n"
- "tbz x20, #2, 12f\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 11f\n"
- "ld1 { v13.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[14], [x22]\n"
- "b 26f\n"
- "11:" // 4 rounds: Partial load: partial_1_28
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[12], [x22]\n"
- "b 26f\n"
- "12:" // 4 rounds: Partial load: partial_2_24
- "tbz x20, #1, 13f\n"
- "ld1 { v13.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[10], [x22]\n"
- "b 26f\n"
- "13:" // 4 rounds: Partial load: partial_1_24
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[8], [x22]\n"
- "b 26f\n"
- "14:" // 4 rounds: Partial load: partial_4_16
- "tbz x20, #2, 16f\n"
- "ldr s13, [x22], #0x4\n"
- "tbz x20, #1, 15f\n"
- "ld1 { v13.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[6], [x22]\n"
- "b 26f\n"
- "15:" // 4 rounds: Partial load: partial_1_20
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[4], [x22]\n"
- "b 26f\n"
- "16:" // 4 rounds: Partial load: partial_2_16
- "tbz x20, #1, 17f\n"
- "ldr h13, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[2], [x22]\n"
- "b 26f\n"
- "17:" // 4 rounds: Partial load: partial_1_16
- "tbz x20, #0, 26f\n"
- "ldr b13, [x22, #0x0]\n"
- "b 26f\n"
- "18:" // 4 rounds: Partial load: partial_8_0
- "tbz x20, #3, 22f\n"
- "ldr d8, [x22], #0x8\n"
- "tbz x20, #2, 20f\n"
- "ld1 { v8.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 19f\n"
- "ld1 { v8.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[14], [x22]\n"
- "b 26f\n"
- "19:" // 4 rounds: Partial load: partial_1_12
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[12], [x22]\n"
- "b 26f\n"
- "20:" // 4 rounds: Partial load: partial_2_8
- "tbz x20, #1, 21f\n"
- "ld1 { v8.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[10], [x22]\n"
- "b 26f\n"
- "21:" // 4 rounds: Partial load: partial_1_8
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[8], [x22]\n"
- "b 26f\n"
- "22:" // 4 rounds: Partial load: partial_4_0
- "tbz x20, #2, 24f\n"
- "ldr s8, [x22], #0x4\n"
- "tbz x20, #1, 23f\n"
- "ld1 { v8.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[6], [x22]\n"
- "b 26f\n"
- "23:" // 4 rounds: Partial load: partial_1_4
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[4], [x22]\n"
- "b 26f\n"
- "24:" // 4 rounds: Partial load: partial_2_0
- "tbz x20, #1, 25f\n"
- "ldr h8, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[2], [x22]\n"
- "b 26f\n"
- "25:" // 4 rounds: Partial load: partial_1_0
- "ldr b8, [x22, #0x0]\n"
- "26:" // 4 rounds: Partial load: Done
- "b 28f\n"
- "27:" // 4 rounds: Full load
- "ldr q8, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "ldr q12, [x22, #0x20]\n"
- "add x22, x22, #0x30\n"
- "28:" // 4 rounds: Load done
- "sub v0.16b, v8.16b, v11.16b\n"
- "sub v7.16b, v8.16b, v10.16b\n"
- "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
- "sub v6.16b, v8.16b, v9.16b\n"
- "sub v5.16b, v13.16b, v11.16b\n"
- "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
- "sub v4.16b, v13.16b, v10.16b\n"
- "sub v3.16b, v13.16b, v9.16b\n"
- "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
- "sub v2.16b, v12.16b, v11.16b\n"
- "sub v1.16b, v12.16b, v10.16b\n"
- "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
- "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
- "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
- "orr v8.16b, v8.16b, v0.16b\n"
- "sub v0.16b, v12.16b, v9.16b\n"
- "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
- "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
- "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
- "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
- "orr v7.16b, v7.16b, v6.16b\n"
- "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
- "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
- "orr v13.16b, v13.16b, v5.16b\n"
- "orr v4.16b, v4.16b, v3.16b\n"
- "orr v12.16b, v12.16b, v2.16b\n"
- "cmp x20, #0x30\n"
- "orr v1.16b, v1.16b, v0.16b\n"
- "orr v8.16b, v8.16b, v7.16b\n"
- "orr v13.16b, v13.16b, v4.16b\n"
- "orr v12.16b, v12.16b, v1.16b\n"
- "bge 53f\n"
- "tbz x20, #5, 36f\n"
- "st1 { v8.16b }, [x21], #0x10\n"
- "st1 { v13.16b }, [x21], #0x10\n"
- "tbz x20, #3, 32f\n"
- "str d12, [x21], #0x8\n"
- "tbz x20, #2, 30f\n"
- "st1 { v12.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 29f\n"
- "st1 { v12.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[14], [x21]\n"
- "b 52f\n"
- "29:" // 4 rounds: Partial writeback: partial_1_44
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[12], [x21]\n"
- "b 52f\n"
- "30:" // 4 rounds: Partial writeback: partial_2_40
- "tbz x20, #1, 31f\n"
- "st1 { v12.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[10], [x21]\n"
- "b 52f\n"
- "31:" // 4 rounds: Partial writeback: partial_1_40
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[8], [x21]\n"
- "b 52f\n"
- "32:" // 4 rounds: Partial writeback: partial_4_32
- "tbz x20, #2, 34f\n"
- "str s12, [x21], #0x4\n"
- "tbz x20, #1, 33f\n"
- "st1 { v12.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[6], [x21]\n"
- "b 52f\n"
- "33:" // 4 rounds: Partial writeback: partial_1_36
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[4], [x21]\n"
- "b 52f\n"
- "34:" // 4 rounds: Partial writeback: partial_2_32
- "tbz x20, #1, 35f\n"
- "str h12, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[2], [x21]\n"
- "b 52f\n"
- "35:" // 4 rounds: Partial writeback: partial_1_32
- "tbz x20, #0, 52f\n"
- "str b12, [x21, #0x0]\n"
- "b 52f\n"
- "36:" // 4 rounds: Partial writeback: partial_16_0
- "tbz x20, #4, 44f\n"
- "st1 { v8.16b }, [x21], #0x10\n"
- "tbz x20, #3, 40f\n"
- "str d13, [x21], #0x8\n"
- "tbz x20, #2, 38f\n"
- "st1 { v13.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 37f\n"
- "st1 { v13.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[14], [x21]\n"
- "b 52f\n"
- "37:" // 4 rounds: Partial writeback: partial_1_28
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[12], [x21]\n"
- "b 52f\n"
- "38:" // 4 rounds: Partial writeback: partial_2_24
- "tbz x20, #1, 39f\n"
- "st1 { v13.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[10], [x21]\n"
- "b 52f\n"
- "39:" // 4 rounds: Partial writeback: partial_1_24
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[8], [x21]\n"
- "b 52f\n"
- "40:" // 4 rounds: Partial writeback: partial_4_16
- "tbz x20, #2, 42f\n"
- "str s13, [x21], #0x4\n"
- "tbz x20, #1, 41f\n"
- "st1 { v13.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[6], [x21]\n"
- "b 52f\n"
- "41:" // 4 rounds: Partial writeback: partial_1_20
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[4], [x21]\n"
- "b 52f\n"
- "42:" // 4 rounds: Partial writeback: partial_2_16
- "tbz x20, #1, 43f\n"
- "str h13, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[2], [x21]\n"
- "b 52f\n"
- "43:" // 4 rounds: Partial writeback: partial_1_16
- "tbz x20, #0, 52f\n"
- "str b13, [x21, #0x0]\n"
- "b 52f\n"
- "44:" // 4 rounds: Partial writeback: partial_8_0
- "tbz x20, #3, 48f\n"
- "str d8, [x21], #0x8\n"
- "tbz x20, #2, 46f\n"
- "st1 { v8.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 45f\n"
- "st1 { v8.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[14], [x21]\n"
- "b 52f\n"
- "45:" // 4 rounds: Partial writeback: partial_1_12
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[12], [x21]\n"
- "b 52f\n"
- "46:" // 4 rounds: Partial writeback: partial_2_8
- "tbz x20, #1, 47f\n"
- "st1 { v8.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[10], [x21]\n"
- "b 52f\n"
- "47:" // 4 rounds: Partial writeback: partial_1_8
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[8], [x21]\n"
- "b 52f\n"
- "48:" // 4 rounds: Partial writeback: partial_4_0
- "tbz x20, #2, 50f\n"
- "str s8, [x21], #0x4\n"
- "tbz x20, #1, 49f\n"
- "st1 { v8.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[6], [x21]\n"
- "b 52f\n"
- "49:" // 4 rounds: Partial writeback: partial_1_4
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[4], [x21]\n"
- "b 52f\n"
- "50:" // 4 rounds: Partial writeback: partial_2_0
- "tbz x20, #1, 51f\n"
- "str h8, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[2], [x21]\n"
- "b 52f\n"
- "51:" // 4 rounds: Partial writeback: partial_1_0
- "str b8, [x21, #0x0]\n"
- "52:" // 4 rounds: Partial writeback: Done
- "b 54f\n"
- "53:" // 4 rounds: Full writeback
- "str q8, [x21, #0x0]\n"
- "str q13, [x21, #0x10]\n"
- "str q12, [x21, #0x20]\n"
- "add x21, x21, #0x30\n"
- "54:" // 4 rounds: Writeback done
- "subs x20, x20, #0x30\n"
- "bgt 2b\n"
- "add x23, x23, #0x1\n"
- "cmp x23, %x[num_strings]\n"
- "bne 1b\n"
- :
- : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
+ __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n"
+ "ldr q17, [%x[table], #0x10]\n"
+ "mov x23, #0x0\n"
+ "ldr q18, [%x[table], #0x20]\n"
+ "ldr q19, [%x[table], #0x30]\n"
+ "ldr q20, [%x[table], #0x40]\n"
+ "ldr q21, [%x[table], #0x50]\n"
+ "ldr q22, [%x[table], #0x60]\n"
+ "ldr q23, [%x[table], #0x70]\n"
+ "ldr q24, [%x[table], #0x80]\n"
+ "ldr q25, [%x[table], #0x90]\n"
+ "ldr q26, [%x[table], #0xa0]\n"
+ "ldr q27, [%x[table], #0xb0]\n"
+ "ldr q28, [%x[table], #0xc0]\n"
+ "ldr q29, [%x[table], #0xd0]\n"
+ "ldr q30, [%x[table], #0xe0]\n"
+ "ldr q31, [%x[table], #0xf0]\n"
+ "1:" // string loop
+ "ldr x22, [%x[input], x23, LSL #0x3]\n"
+ "ldr x21, [%x[output], x23, LSL #0x3]\n"
+ "movi v11.16b, #0x40\n"
+ "movi v10.16b, #0x80\n"
+ "movi v9.16b, #0xc0\n"
+ "mov x20, %x[string_length]\n"
+ "2:" // 4 rounds: width loop
+ "cmp x20, #0x30\n"
+ "bge 27f\n"
+ "tbz x20, #5, 10f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "ld1 { v13.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 6f\n"
+ "ldr d12, [x22], #0x8\n"
+ "tbz x20, #2, 4f\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 3f\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[14], [x22]\n"
+ "b 26f\n"
+ "3:" // 4 rounds: Partial load: partial_1_44
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[12], [x22]\n"
+ "b 26f\n"
+ "4:" // 4 rounds: Partial load: partial_2_40
+ "tbz x20, #1, 5f\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[10], [x22]\n"
+ "b 26f\n"
+ "5:" // 4 rounds: Partial load: partial_1_40
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[8], [x22]\n"
+ "b 26f\n"
+ "6:" // 4 rounds: Partial load: partial_4_32
+ "tbz x20, #2, 8f\n"
+ "ldr s12, [x22], #0x4\n"
+ "tbz x20, #1, 7f\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[6], [x22]\n"
+ "b 26f\n"
+ "7:" // 4 rounds: Partial load: partial_1_36
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[4], [x22]\n"
+ "b 26f\n"
+ "8:" // 4 rounds: Partial load: partial_2_32
+ "tbz x20, #1, 9f\n"
+ "ldr h12, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[2], [x22]\n"
+ "b 26f\n"
+ "9:" // 4 rounds: Partial load: partial_1_32
+ "tbz x20, #0, 26f\n"
+ "ldr b12, [x22, #0x0]\n"
+ "b 26f\n"
+ "10:" // 4 rounds: Partial load: partial_16_0
+ "tbz x20, #4, 18f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 14f\n"
+ "ldr d13, [x22], #0x8\n"
+ "tbz x20, #2, 12f\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 11f\n"
+ "ld1 { v13.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[14], [x22]\n"
+ "b 26f\n"
+ "11:" // 4 rounds: Partial load: partial_1_28
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[12], [x22]\n"
+ "b 26f\n"
+ "12:" // 4 rounds: Partial load: partial_2_24
+ "tbz x20, #1, 13f\n"
+ "ld1 { v13.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[10], [x22]\n"
+ "b 26f\n"
+ "13:" // 4 rounds: Partial load: partial_1_24
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[8], [x22]\n"
+ "b 26f\n"
+ "14:" // 4 rounds: Partial load: partial_4_16
+ "tbz x20, #2, 16f\n"
+ "ldr s13, [x22], #0x4\n"
+ "tbz x20, #1, 15f\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[6], [x22]\n"
+ "b 26f\n"
+ "15:" // 4 rounds: Partial load: partial_1_20
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[4], [x22]\n"
+ "b 26f\n"
+ "16:" // 4 rounds: Partial load: partial_2_16
+ "tbz x20, #1, 17f\n"
+ "ldr h13, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[2], [x22]\n"
+ "b 26f\n"
+ "17:" // 4 rounds: Partial load: partial_1_16
+ "tbz x20, #0, 26f\n"
+ "ldr b13, [x22, #0x0]\n"
+ "b 26f\n"
+ "18:" // 4 rounds: Partial load: partial_8_0
+ "tbz x20, #3, 22f\n"
+ "ldr d8, [x22], #0x8\n"
+ "tbz x20, #2, 20f\n"
+ "ld1 { v8.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 19f\n"
+ "ld1 { v8.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[14], [x22]\n"
+ "b 26f\n"
+ "19:" // 4 rounds: Partial load: partial_1_12
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[12], [x22]\n"
+ "b 26f\n"
+ "20:" // 4 rounds: Partial load: partial_2_8
+ "tbz x20, #1, 21f\n"
+ "ld1 { v8.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[10], [x22]\n"
+ "b 26f\n"
+ "21:" // 4 rounds: Partial load: partial_1_8
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[8], [x22]\n"
+ "b 26f\n"
+ "22:" // 4 rounds: Partial load: partial_4_0
+ "tbz x20, #2, 24f\n"
+ "ldr s8, [x22], #0x4\n"
+ "tbz x20, #1, 23f\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "b 26f\n"
+ "23:" // 4 rounds: Partial load: partial_1_4
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "b 26f\n"
+ "24:" // 4 rounds: Partial load: partial_2_0
+ "tbz x20, #1, 25f\n"
+ "ldr h8, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "b 26f\n"
+ "25:" // 4 rounds: Partial load: partial_1_0
+ "ldr b8, [x22, #0x0]\n"
+ "26:" // 4 rounds: Partial load: Done
+ "b 28f\n"
+ "27:" // 4 rounds: Full load
+ "ldr q8, [x22, #0x0]\n"
+ "ldr q13, [x22, #0x10]\n"
+ "ldr q12, [x22, #0x20]\n"
+ "add x22, x22, #0x30\n"
+ "28:" // 4 rounds: Load done
+ "sub v0.16b, v8.16b, v11.16b\n"
+ "sub v7.16b, v8.16b, v10.16b\n"
+ "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
+ "sub v6.16b, v8.16b, v9.16b\n"
+ "sub v5.16b, v13.16b, v11.16b\n"
+ "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
+ "sub v4.16b, v13.16b, v10.16b\n"
+ "sub v3.16b, v13.16b, v9.16b\n"
+ "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
+ "sub v2.16b, v12.16b, v11.16b\n"
+ "sub v1.16b, v12.16b, v10.16b\n"
+ "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
+ "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
+ "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
+ "orr v8.16b, v8.16b, v0.16b\n"
+ "sub v0.16b, v12.16b, v9.16b\n"
+ "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
+ "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
+ "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
+ "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
+ "orr v7.16b, v7.16b, v6.16b\n"
+ "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
+ "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
+ "orr v13.16b, v13.16b, v5.16b\n"
+ "orr v4.16b, v4.16b, v3.16b\n"
+ "orr v12.16b, v12.16b, v2.16b\n"
+ "cmp x20, #0x30\n"
+ "orr v1.16b, v1.16b, v0.16b\n"
+ "orr v8.16b, v8.16b, v7.16b\n"
+ "orr v13.16b, v13.16b, v4.16b\n"
+ "orr v12.16b, v12.16b, v1.16b\n"
+ "bge 53f\n"
+ "tbz x20, #5, 36f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "st1 { v13.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 32f\n"
+ "str d12, [x21], #0x8\n"
+ "tbz x20, #2, 30f\n"
+ "st1 { v12.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v12.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[14], [x21]\n"
+ "b 52f\n"
+ "29:" // 4 rounds: Partial writeback: partial_1_44
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[12], [x21]\n"
+ "b 52f\n"
+ "30:" // 4 rounds: Partial writeback: partial_2_40
+ "tbz x20, #1, 31f\n"
+ "st1 { v12.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[10], [x21]\n"
+ "b 52f\n"
+ "31:" // 4 rounds: Partial writeback: partial_1_40
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[8], [x21]\n"
+ "b 52f\n"
+ "32:" // 4 rounds: Partial writeback: partial_4_32
+ "tbz x20, #2, 34f\n"
+ "str s12, [x21], #0x4\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v12.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[6], [x21]\n"
+ "b 52f\n"
+ "33:" // 4 rounds: Partial writeback: partial_1_36
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[4], [x21]\n"
+ "b 52f\n"
+ "34:" // 4 rounds: Partial writeback: partial_2_32
+ "tbz x20, #1, 35f\n"
+ "str h12, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[2], [x21]\n"
+ "b 52f\n"
+ "35:" // 4 rounds: Partial writeback: partial_1_32
+ "tbz x20, #0, 52f\n"
+ "str b12, [x21, #0x0]\n"
+ "b 52f\n"
+ "36:" // 4 rounds: Partial writeback: partial_16_0
+ "tbz x20, #4, 44f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 40f\n"
+ "str d13, [x21], #0x8\n"
+ "tbz x20, #2, 38f\n"
+ "st1 { v13.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 37f\n"
+ "st1 { v13.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[14], [x21]\n"
+ "b 52f\n"
+ "37:" // 4 rounds: Partial writeback: partial_1_28
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[12], [x21]\n"
+ "b 52f\n"
+ "38:" // 4 rounds: Partial writeback: partial_2_24
+ "tbz x20, #1, 39f\n"
+ "st1 { v13.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[10], [x21]\n"
+ "b 52f\n"
+ "39:" // 4 rounds: Partial writeback: partial_1_24
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[8], [x21]\n"
+ "b 52f\n"
+ "40:" // 4 rounds: Partial writeback: partial_4_16
+ "tbz x20, #2, 42f\n"
+ "str s13, [x21], #0x4\n"
+ "tbz x20, #1, 41f\n"
+ "st1 { v13.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[6], [x21]\n"
+ "b 52f\n"
+ "41:" // 4 rounds: Partial writeback: partial_1_20
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[4], [x21]\n"
+ "b 52f\n"
+ "42:" // 4 rounds: Partial writeback: partial_2_16
+ "tbz x20, #1, 43f\n"
+ "str h13, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[2], [x21]\n"
+ "b 52f\n"
+ "43:" // 4 rounds: Partial writeback: partial_1_16
+ "tbz x20, #0, 52f\n"
+ "str b13, [x21, #0x0]\n"
+ "b 52f\n"
+ "44:" // 4 rounds: Partial writeback: partial_8_0
+ "tbz x20, #3, 48f\n"
+ "str d8, [x21], #0x8\n"
+ "tbz x20, #2, 46f\n"
+ "st1 { v8.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 45f\n"
+ "st1 { v8.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[14], [x21]\n"
+ "b 52f\n"
+ "45:" // 4 rounds: Partial writeback: partial_1_12
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[12], [x21]\n"
+ "b 52f\n"
+ "46:" // 4 rounds: Partial writeback: partial_2_8
+ "tbz x20, #1, 47f\n"
+ "st1 { v8.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[10], [x21]\n"
+ "b 52f\n"
+ "47:" // 4 rounds: Partial writeback: partial_1_8
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[8], [x21]\n"
+ "b 52f\n"
+ "48:" // 4 rounds: Partial writeback: partial_4_0
+ "tbz x20, #2, 50f\n"
+ "str s8, [x21], #0x4\n"
+ "tbz x20, #1, 49f\n"
+ "st1 { v8.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[6], [x21]\n"
+ "b 52f\n"
+ "49:" // 4 rounds: Partial writeback: partial_1_4
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[4], [x21]\n"
+ "b 52f\n"
+ "50:" // 4 rounds: Partial writeback: partial_2_0
+ "tbz x20, #1, 51f\n"
+ "str h8, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[2], [x21]\n"
+ "b 52f\n"
+ "51:" // 4 rounds: Partial writeback: partial_1_0
+ "str b8, [x21, #0x0]\n"
+ "52:" // 4 rounds: Partial writeback: Done
+ "b 54f\n"
+ "53:" // 4 rounds: Full writeback
+ "str q8, [x21, #0x0]\n"
+ "str q13, [x21, #0x10]\n"
+ "str q12, [x21, #0x20]\n"
+ "add x21, x21, #0x30\n"
+ "54:" // 4 rounds: Writeback done
+ "subs x20, x20, #0x30\n"
+ "bgt 2b\n"
+ "add x23, x23, #0x1\n"
+ "cmp x23, %x[num_strings]\n"
+ "bne 1b\n"
+ :
+ : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output),
+ [string_length] "r"(string_length), [table] "r"(table)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
}
#endif // __aarch64__
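Only the layout of the assembly string changes here; the kernel still performs a byte-wise, 256-entry table lookup over every input string, handling 48 bytes per loop iteration and splitting each lookup into four 64-entry TBL rounds (the 0x40/0x80/0xc0 bias vectors select the upper table quarters). A scalar equivalent, shown purely for reference (plain C++, no NEON; the _ref suffix is illustrative):

    #include <cstddef>
    #include <cstdint>

    // Scalar reference of lut_u8: out[s][i] = table[in[s][i]] for every byte of
    // every string; the NEON and SVE2 kernels compute exactly this, just wider.
    void lut_u8_ref(const uint8_t *table,
                    size_t num_strings,
                    size_t string_length,
                    const uint8_t *const *input,
                    uint8_t *const *output)
    {
        for (size_t s = 0; s < num_strings; ++s)
        {
            for (size_t i = 0; i < string_length; ++i)
            {
                output[s][i] = table[input[s][i]];
            }
        }
    }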
diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp
index b80d75326e..ee8572703e 100644
--- a/src/cpu/kernels/lut/generic/sve2/u8.cpp
+++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp
@@ -32,11 +32,7 @@ namespace arm_compute
namespace cpu
{
void lut_u8_sve2(
- const uint8_t *table,
- size_t num_strings,
- size_t string_length,
- const uint8_t *const *input,
- uint8_t *const *output)
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
{
__asm__ __volatile__(
"ptrue p0.b\n"
@@ -636,7 +632,9 @@ void lut_u8_sve2(
"bne 2b\n"
: [table] "+&r"(table)
: [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1",
+ "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21",
+ "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
}
} // namespace cpu
diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h
index 7a2afc6927..da90346267 100644
--- a/src/cpu/kernels/lut/list.h
+++ b/src/cpu/kernels/lut/list.h
@@ -34,13 +34,9 @@ namespace cpu
{
#ifdef __aarch64__
-#define DECLARE_LUT_KERNEL(func_name) \
- void func_name( \
- const uint8_t *table, \
- size_t num_strings, \
- size_t string_length, \
- const uint8_t *const *input, \
- uint8_t *const *output)
+#define DECLARE_LUT_KERNEL(func_name) \
+ void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \
+ uint8_t *const *output)
DECLARE_LUT_KERNEL(lut_u8_neon);
DECLARE_LUT_KERNEL(lut_u8_sve2);
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h
index 5fe19c4707..73a5b86a2f 100644
--- a/src/cpu/kernels/maxunpool/generic/neon/impl.h
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -37,13 +38,15 @@ void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output
Iterator indices_itr(indices, window);
auto out_ptr = reinterpret_cast<T *>(output->buffer());
const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
- auto vinput = reinterpret_cast<T *>(input_itr.ptr());
- out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
- },
- input_itr, indices_itr);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+ auto vinput = reinterpret_cast<T *>(input_itr.ptr());
+ out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+ },
+ input_itr, indices_itr);
}
} // namespace cpu
} // namespace arm_compute
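The loop body itself is unchanged: each pooled value is scattered into the output buffer at the position stored in the indices tensor, offset by the batch stride. A minimal scalar sketch of that scatter over a flat output array (names are illustrative; the batch-stride term is folded into the index here):

    #include <cstddef>
    #include <cstdint>

    // Scalar sketch of max unpooling: every pooled value goes back to the
    // location its recorded index points at; the rest of the output is assumed
    // to have been zero-initialised beforehand.
    template <typename T>
    void max_unpool_ref(const T *pooled, const uint32_t *indices, std::size_t num_pooled, T *out)
    {
        for (std::size_t i = 0; i < num_pooled; ++i)
        {
            out[indices[i]] = pooled[i];
        }
    }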
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
index 96e4030268..6470f391e2 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -23,9 +23,9 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
namespace arm_compute
{
@@ -45,64 +45,66 @@ void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, fl
Iterator input_itr(input, win);
Iterator output_itr(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
- float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
- float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
+ float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
+ float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- float16x8_t data = vld1q_f16(in_ptr + x);
- sum_vec = vaddq_f16(sum_vec, data);
- float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
- float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
- sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
- sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ sum_vec = vaddq_f16(sum_vec, data);
+ float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
+ float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
+ }
- float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
- sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
- sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+ float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
+ sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+ sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
- float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
- sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
+ float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
+ sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
- float16_t sum = vget_lane_f16(sum_carry_res, 0);
- float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
+ float16_t sum = vget_lane_f16(sum_carry_res, 0);
+ float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float16_t data = *(in_ptr + x);
- sum += data;
- float fdata = static_cast<float>(data);
- sum_sq += fdata * fdata;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ float16_t data = *(in_ptr + x);
+ sum += data;
+ float fdata = static_cast<float>(data);
+ sum_sq += fdata * fdata;
+ }
- float16_t mean = sum / input->info()->dimension(0);
- float var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
- float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
+ float16_t mean = sum / input->info()->dimension(0);
+ float var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
- float16x8_t mean_vec = vdupq_n_f16(mean);
- float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
+ float16x8_t mean_vec = vdupq_n_f16(mean);
+ float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- float16x8_t data = vld1q_f16(in_ptr + x);
- float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
- // Store results
- vst1q_f16(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input_itr, output_itr);
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
+ // Store results
+ vst1q_f16(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
}
void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
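One detail worth noting in the FP16 path above: the plain sum is accumulated in half precision, but the values are widened to float32 (vcvt_f32_f16) before squaring, so the sum of squares loses less precision. The same widen-before-square pattern, sketched with float data widened to double purely for illustration:

    #include <cstddef>

    // Mixed-precision accumulation: keep the simple sum in the narrow type,
    // square and accumulate in the wider type (mirrors the f16 sum / f32 sum_sq above).
    void sum_and_sumsq(const float *in, std::size_t len, float &sum, double &sum_sq)
    {
        sum    = 0.0f;
        sum_sq = 0.0;
        for (std::size_t i = 0; i < len; ++i)
        {
            sum += in[i];
            const double wide = static_cast<double>(in[i]);
            sum_sq += wide * wide;
        }
    }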
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
index 0522d6e277..11f6294a35 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -45,60 +46,62 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c
Iterator input_itr(input, win);
Iterator output_itr(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
- auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- sum_vec = wrapper::vadd(sum_vec, data);
- sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ sum_vec = wrapper::vadd(sum_vec, data);
+ sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
+ }
- auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
- auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
- for(int i = 0; i < size / 4; ++i)
- {
- sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
- sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
- }
+ auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
+ auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
+ for (int i = 0; i < size / 4; ++i)
+ {
+ sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
+ sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
+ }
- auto sum = wrapper::vgetlane(sum_carry_res, 0);
- auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
+ auto sum = wrapper::vgetlane(sum_carry_res, 0);
+ auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- ScalarType data = *(in_ptr + x);
- sum += data;
- sum_sq += data * data;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ ScalarType data = *(in_ptr + x);
+ sum += data;
+ sum_sq += data * data;
+ }
- ScalarType mean = sum / input->info()->dimension(0);
- ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
- ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+ ScalarType mean = sum / input->info()->dimension(0);
+ ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
- // Store results
- wrapper::vstore(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input_itr, output_itr);
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
+ // Store results
+ wrapper::vstore(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
}
template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window);
} // namespace cpu
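The reindented lambda keeps the same two-pass computation per row: accumulate the sum and the sum of squares, then rescale every element by the derived mean and inverse standard deviation. A scalar reference of that computation (plain arrays instead of tensor iterators; names are illustrative):

    #include <cmath>
    #include <cstddef>

    // Scalar reference of mean/stddev normalisation over one row:
    // out[i] = (in[i] - mean) / sqrt(var + epsilon)
    void mean_stddev_normalize_ref(const float *in, float *out, std::size_t len, float epsilon)
    {
        float sum = 0.0f, sum_sq = 0.0f;
        for (std::size_t i = 0; i < len; ++i)
        {
            sum    += in[i];
            sum_sq += in[i] * in[i];
        }
        const float mean       = sum / static_cast<float>(len);
        const float var        = sum_sq / static_cast<float>(len) - mean * mean;
        const float stddev_inv = 1.0f / std::sqrt(var + epsilon);
        for (std::size_t i = 0; i < len; ++i)
        {
            out[i] = (in[i] - mean) * stddev_inv;
        }
    }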
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
index 53af1e4b16..32654df5dc 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon,
const float32x4_t quant_min_vec = vdupq_n_f32(0.0f);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
- uint32x4_t sum_vec = vdupq_n_u32(0);
- uint32x4_t sum_sq_vec = vdupq_n_u32(0);
+ uint32x4_t sum_vec = vdupq_n_u32(0);
+ uint32x4_t sum_sq_vec = vdupq_n_u32(0);
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t data = vld1q_u8(in_ptr + x);
- sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
- const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data));
- const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
- sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
+ const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data));
+ const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
+ sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
+ }
#ifdef __aarch64__
- sum_vec = vpaddq_u32(sum_vec, sum_vec);
- sum_vec = vpaddq_u32(sum_vec, sum_vec);
- uint32_t sum = vgetq_lane_u32(sum_vec, 0);
- sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
- sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
- uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
#elif __arm__ // #ifdef __aarch64__
- uint32_t sum = vgetq_lane_u32(sum_vec, 0) +
- vgetq_lane_u32(sum_vec, 1) +
- vgetq_lane_u32(sum_vec, 2) +
- vgetq_lane_u32(sum_vec, 3);
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) +
+ vgetq_lane_u32(sum_vec, 3);
- uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) +
- vgetq_lane_u32(sum_sq_vec, 1) +
- vgetq_lane_u32(sum_sq_vec, 2) +
- vgetq_lane_u32(sum_sq_vec, 3);
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) +
+ vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3);
#endif // #ifdef __aarch64__
- for(; x < window_end_x; ++x)
- {
- auto data = static_cast<uint32_t>(*(in_ptr + x));
- sum += data;
- sum_sq += (data * data);
- }
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<uint32_t>(*(in_ptr + x));
+ sum += data;
+ sum_sq += (data * data);
+ }
- const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
- const float var = (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
- const float stdev_inv = 1.0f / sqrtf(var + epsilon);
- const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale);
- const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t data = vld1q_u8(in_ptr + x);
- float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
- float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
- float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
- float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
- db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
- const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
- vst1q_u8(out_ptr + x, out);
- }
+ const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
+ const float var =
+ (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
+ const float stdev_inv = 1.0f / sqrtf(var + epsilon);
+ const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale);
+ const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
+ float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
+ db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
+ vst1q_u8(out_ptr + x, out);
+ }
- for(; x < window_end_x; ++x)
- {
- auto data = static_cast<float32_t>(*(in_ptr + x));
- const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
- *(out_ptr + x) = res;
- }
- },
- input_itr, output_itr);
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<float32_t>(*(in_ptr + x));
+ const uint8_t res =
+ data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
+ *(out_ptr + x) = res;
+ }
+ },
+ input_itr, output_itr);
}
} // namespace cpu
} // namespace arm_compute
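The quantised kernel folds dequantisation, normalisation and requantisation into a single affine map per element, which is exactly the v_scale/v_offset pair built above: out = x * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset), clamped to the uint8 range in the vectorised path. A scalar sketch of that mapping (parameter names are illustrative; the upper clamp bound is assumed to be 255, matching quant_max_vec):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Scalar sketch of the QASYMM8 mean/stddev normalisation inner mapping.
    // mean and stdev_inv come from the raw uint8 statistics of the row, as in
    // the kernel above; output_inv_scale is 1 / output_scale.
    void qasymm8_msdn_row(const uint8_t *in, uint8_t *out, std::size_t len,
                          float mean, float stdev_inv,
                          float output_inv_scale, float output_offset)
    {
        const float scale  = stdev_inv * output_inv_scale;
        const float offset = -mean * stdev_inv * output_inv_scale + output_offset;
        for (std::size_t i = 0; i < len; ++i)
        {
            const float v = static_cast<float>(in[i]) * scale + offset;
            out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, v)));
        }
    }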
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 4e15d3ad3f..4af59c2ad4 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
@@ -37,7 +38,12 @@ namespace cpu
{
namespace
{
-void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f16_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -53,8 +59,8 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int pad_right = src->info()->padding().right;
@@ -63,97 +69,114 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
- const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
- const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
- const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
- const auto v_x0 = vld1q_f16(in_x0_ptr);
- const auto v_x1 = vld1q_f16(in_x1_ptr);
- const auto v_x2 = vld1q_f16(in_x2_ptr);
- const auto v_x3 = vld1q_f16(in_x3_ptr);
- float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
- const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
- const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
- const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
- const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
- const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
- const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
- const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
- const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
- const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
- const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
- // Store indicies
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
- float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-}
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
+ const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
+ const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
+ const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
+ const auto v_x0 = vld1q_f16(in_x0_ptr);
+ const auto v_x1 = vld1q_f16(in_x1_ptr);
+ const auto v_x2 = vld1q_f16(in_x2_ptr);
+ const auto v_x3 = vld1q_f16(in_x3_ptr);
+ float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7};
+ const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
+ const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7};
+ const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
+ const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7};
+ const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
+ const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+ const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7};
+ const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
+ const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint16x8_t tmp_indices2 =
+ vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+ const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
+ const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
+                // Store indices
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
+ }
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
+ float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
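+                // Scalar tail: the same pairwise argmax selection as the vector path, one channel at a time.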
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
+}
+} // namespace
+
-void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp16_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
}
@@ -167,151 +190,172 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
Iterator in(src, window_src);
Iterator out(dst0, window_out);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
const float16_t min_value = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
float16x8_t vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x8_t scale_v = vdupq_n_f16(scale);
-
- // Perform pooling
- vres = vdupq_n_f16(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f16(0.0f);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f16(vres, vmulq_f16(data, data));
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vaddq_f16(vres, vmulq_f16(data, data));
+ }
+ else
+ {
+ vres = vaddq_f16(vres, data);
+ }
}
- else
+ }
+ // Divide by scale
+ vres = vmulq_f16(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f16(min_value);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f16(vres, data);
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f16(vres, data);
}
}
}
- // Divide by scale
- vres = vmulq_f16(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f16(min_value);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f16(vres, data);
- }
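+                    // vrsqrteq_f16 estimates 1/sqrt(vres); one vrsqrtsq_f16 Newton-Raphson step refines it,
+                    // and multiplying by vres yields sqrt(vres) for the L2 result.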
+ float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+ vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal));
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
- vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
}
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- // Calculate scale
- const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float16_t res = 0.0f;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data * data;
+ const float data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
}
- else
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data;
+ const float16_t data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
}
}
}
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = min_value;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
+ res = std::sqrt(res);
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
}
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index a400f3a95d..aaa37863cb 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
@@ -34,7 +35,12 @@ namespace cpu
{
namespace
{
-void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f32_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -50,8 +56,8 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
float32x4_t vres;
@@ -63,89 +69,102 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
- const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
- const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
- const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
- const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
- const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
- const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
- const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
- vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-
- // Store indices
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
- res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
+ const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
+ const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
+ const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
+ const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
+ const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
+ const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
+ const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
+ vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
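+                // Per lane, keep the offset of the larger input in each row of the 2x2 window, then select
+                // between the two row maxima to obtain the argmax that is stored as the index.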
+ const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint32x4_t tmp_indices2 =
+ vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+
+ // Store indices
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
+ res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
}
} // namespace
-void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
+void poolingMxN_fp32_neon_nhwc_kernel_indices(
+ const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
constexpr int window_step_x = 4;
Window window_out = window;
@@ -160,8 +179,8 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
@@ -169,9 +188,9 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
float32x4_t vres;
uint32x4_t vidx;
- constexpr int idx_width = 1;
- constexpr int idx_height = 2;
- constexpr int idx_batch = 3;
+ constexpr int idx_width = 1;
+ constexpr int idx_height = 2;
+ constexpr int idx_batch = 3;
const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
@@ -182,89 +201,97 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int pool_start_x = std::max(0, -idx_width);
- const int pool_start_y = std::max(0, -idx_height);
+ const int pool_start_x = std::max(0, -idx_width);
+ const int pool_start_y = std::max(0, -idx_height);
- const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
- const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
+ const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
+ const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
- const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
- const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
- const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
+ const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
+ const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- vres = vdupq_n_f32(min_value);
- vidx = vdupq_n_u32(0U);
- const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
- uint32_t curr_kernel_index = pool_size_x * pool_start_y;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
- curr_kernel_index += pool_start_x;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ vres = vdupq_n_f32(min_value);
+ vidx = vdupq_n_u32(0U);
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ uint32_t curr_kernel_index = pool_size_x * pool_start_y;
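+                // curr_kernel_index walks the pooling window in row-major order; in use_kernel_indices mode
+                // the stored index is this position within the window, not a flat tensor offset.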
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
- const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
- const uint32x4_t idxMask = vcgtq_f32(data, vres);
- vidx = vbslq_u32(idxMask, vidx_curr, vidx);
- vres = vmaxq_f32(vres, data);
- in_ptr_x += y_stride;
- curr_kernel_index++;
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ curr_kernel_index += pool_start_x;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
+ const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
+ const uint32x4_t idxMask = vcgtq_f32(data, vres);
+ vidx = vbslq_u32(idxMask, vidx_curr, vidx);
+ vres = vmaxq_f32(vres, data);
+ in_ptr_x += y_stride;
+ curr_kernel_index++;
+ }
+ curr_kernel_index += (pool_size_x - pool_end_x);
+ in_ptr_y += z_stride;
}
- curr_kernel_index += (pool_size_x - pool_end_x);
- in_ptr_y += z_stride;
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
}
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = min_value;
- uint32_t idx = 0U;
- const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ float res = min_value;
+ uint32_t idx = 0U;
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float data = *(reinterpret_cast<const float *>(in_ptr_x));
- if(data > res)
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- idx = pool_size_x * y + x;
- res = data;
+ const float data = *(reinterpret_cast<const float *>(in_ptr_x));
+ if (data > res)
+ {
+ idx = pool_size_x * y + x;
+ res = data;
+ }
+ in_ptr_x += y_stride;
}
- in_ptr_x += y_stride;
+ in_ptr_y += z_stride;
}
- in_ptr_y += z_stride;
- }
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
- }
- },
- out, indices);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
+ }
+ },
+ out, indices);
}
-void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
+ if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
{
poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window);
}
- else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
+ else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX &&
+ !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
{
pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
}
@@ -280,153 +307,174 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
Iterator in(src, window_src);
Iterator out(dst0, window_out);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
float32x4_t vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x4_t scale_v = vdupq_n_f32(scale);
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x4_t scale_v = vdupq_n_f32(scale);
- // Perform pooling
- vres = vdupq_n_f32(0.0f);
+ // Perform pooling
+ vres = vdupq_n_f32(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f32(vres, data);
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vmlaq_f32(vres, data, data);
+ }
+ else
+ {
+ vres = vaddq_f32(vres, data);
+ }
}
}
+ // Divide by scale
+ vres = vmulq_f32(vres, scale_v);
}
- // Divide by scale
- vres = vmulq_f32(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f32(min_value);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ else
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ vres = vdupq_n_f32(min_value);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f32(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f32(vres, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
- };
- vres = l2_res;
- }
-
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))};
+ vres = l2_res;
+ }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = 0.0f;
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ }
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float res = 0.0f;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data * data;
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
}
- else
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data;
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
}
}
}
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = min_value;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
+ res = std::sqrt(res);
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
}
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
+ },
+ in, out);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
index eb141d6fcd..f8f458a63e 100644
--- a/src/cpu/kernels/pool2d/neon/list.h
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -26,16 +26,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/pool2d/neon/quantized.h"
+
#include <arm_neon.h>
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_POOLING_KERNEL(func_name) \
- void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
+#define DECLARE_POOLING_KERNEL(func_name) \
+ void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \
+ const Window &window)
DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
@@ -65,7 +68,12 @@ T get_initial_min(bool use_inf_as_limit)
}
template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
+inline uint32_t offset_no_padding(uint32_t padded_offset,
+ const Coordinates &id,
+ const ITensorInfo &info,
+ int pool_stride_x,
+ int pool_stride_y,
+ DataLayout data_layout)
{
const int pad_left = info.padding().left;
const int pad_right = info.padding().right;
@@ -76,22 +84,24 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
const int pad_horiz = pad_left + pad_right;
const int pad_vert = pad_top + pad_bottom;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
- - pad_top * sizeof(T) /* top padding */
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
- - in_stride_w * id[3];
+ const uint32_t offset_base =
+ padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
+ - pad_top * sizeof(T) /* top padding */
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() -
+ pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+ - in_stride_w * id[3];
return offset_base;
}
else
{
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
- - pad_top * sizeof(T) // top padding
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
+ const uint32_t offset_base = padded_offset -
+ sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
+ - pad_top * sizeof(T) // top padding
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() *
+ pool_stride_y // for each Z plane there are width*pad_right padding elems
- in_stride_w * id[3];
return offset_base;
@@ -100,4 +110,4 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index c342b96426..ee4a67b0fb 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -25,9 +25,11 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
+
#include <limits>
#ifdef ENABLE_NCHW_KERNELS
@@ -38,15 +40,19 @@ namespace cpu
#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
(x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
- (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
-#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
- ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+ (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \
+ : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \
+ ? vdup_n_f32(fval) \
+ : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
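+// The boundary-aware readers substitute the fill value (fval) for elements that fall in the padded border,
+// so padding never contributes to the pooled result.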
#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval))
-float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
+float32x4x2_t
+read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
{
float32x4x2_t vec;
vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
@@ -56,13 +62,14 @@ float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
+float16x4_t
+read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
{
float16_t vec[4];
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -74,94 +81,106 @@ float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t,
return wrapper::vload(vec);
}
-void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
- float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
- float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
- float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- top_data = vmul_f16(top_data, top_data);
- middle_data = vmul_f16(middle_data, middle_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ float16x4_t top_data =
+ read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
+ float16x4_t middle_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
+ float16x4_t res = {};
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
- // Perform pooling
- const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
- res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
- res = vmul_f16(vpadd_f16(res, res), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
- res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
- res = vpmax_f16(res, res);
- }
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vsqrt_f16(res);
- }
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
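+                // Lane 3 (the fourth loaded column, outside the 3-wide window) is zeroed; two pairwise adds
+                // then collapse the three remaining column sums into lane 0 before scaling.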
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t in)
+inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type f16_to_f32(float16x4_t in)
{
- float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
+ float32x2_t out = {static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1))};
return out;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t in)
+inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type f16_to_f32(float32x2_t in)
{
return in;
}
@@ -171,9 +190,9 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int
{
T vec[2];
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -186,61 +205,80 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int
}
template <typename T>
-void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_nchw_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
Iterator in(src, window_src);
Iterator out(dst0, window);
Iterator indices(dst1, window);
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const int pad_left = src->info()->padding().left;
- const int pad_right = src->info()->padding().right;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit);
- const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
- float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
- float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
-
- // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
- const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
- const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
- *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
-
- // Calculate max data indice, which will be used in max unpool.
- const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
- const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
- const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
- const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
- const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
- const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
- const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
- *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
- },
- in, out, indices);
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const int pad_left = src->info()->padding().left;
+ const int pad_right = src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+ const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit);
+ const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+ float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
+ float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
+
+ // Calculate max data, comparing top first, then bottom, to make sure the first max is recorded.
+ const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
+ const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
+ const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
+ *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+ // Calculate the max data index, which will be used in max unpool.
+ const uint32_t offset_base =
+ offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+ const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
+ const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+ const uint32x2_t voffset_top = {offset_top, offset_top + 1u};
+ const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
+ const uint32x2_t tmp_indices_top =
+ vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
+ const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)),
+ voffset_bottom, vrev64_u32(voffset_bottom));
+ *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(
+ vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+ },
+ in, out, indices);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
}
@@ -254,244 +292,274 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
int pool_stride_x, pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
-
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, in_top_ptr, fill_value);
- float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, in_bottom_ptr, fill_value);
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- top_data = vmul_f16(top_data, top_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
+ const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_0, in_top_ptr, fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_1, in_bottom_ptr, fill_value);
+ float16x4_t res = {};
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
- const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
- res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(top_data, bottom_data);
- res = vpmax_f16(max_data, max_data);
- }
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vsqrt_f16(res);
- }
+ const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
+ res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(top_data, bottom_data);
+ res = vpmax_f16(max_data, max_data);
+ }
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
}
}
-void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Calculate scale
- const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float16_t res = 0.0f;
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
- res += data;
+ res += data;
+ }
}
- }
- // Divide by scale
- res *= scale;
- }
- else // if max pooling
- {
- res = fp16_min;
-
- for(int y = 0; y < pool_size_y; ++y)
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
{
- for(int x = 0; x < pool_size_x; ++x)
- {
- const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ res = fp16_min;
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- res = std::max(res, data);
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = res;
+ },
+ in, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float res = 0.0f;
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
- res += data;
+ res += data;
+ }
}
- }
- // Divide by scale
- res *= scale;
- }
- else // if max pooling
- {
- res = min_value;
-
- for(int y = 0; y < pool_size_y; ++y)
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
{
- for(int x = 0; x < pool_size_x; ++x)
- {
- const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ res = min_value;
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- res = std::max(res, data);
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = res;
+ },
+ in, out);
}
-void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
}
@@ -499,64 +567,168 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
{
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr int pool_size = 2;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ in_top_ptr, fill_value);
+ auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_bottom_ptr, fill_value);
+ float32x2_t res = {};
+ float final_res = 0;
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
- execute_window_loop(window, [&](const Coordinates & id)
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
+ }
+}
+
+void pooling3_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
- auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value);
- float32x2_t res = {};
- float final_res = 0;
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr,
+ fill_value);
+ auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_middle_ptr, fill_value);
+ auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ in_bottom_ptr, fill_value);
+
+ float32x2_t res = {};
+ float final_res = 0;
// Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
- top_data = vmul_f32(top_data, top_data);
- bottom_data = vmul_f32(bottom_data, bottom_data);
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
}
- if(pool_info.pool_type != PoolingType::MAX)
+ if (pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
- const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
- res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
}
else
{
- const float32x2_t max_data = vmax_f32(top_data, bottom_data);
- res = vpmax_f32(max_data, max_data);
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
}
final_res = vget_lane_f32(res, 0);
// Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
final_res = sqrt(final_res);
}
@@ -565,191 +737,120 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
*(reinterpret_cast<float *>(out.ptr())) = final_res;
},
in, out);
- }
-}
-
-void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
- const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
-
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
- auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
- auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value);
- auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value);
-
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmulq_f32(top_data, top_data);
- middle_data = vmulq_f32(middle_data, middle_data);
- bottom_data = vmulq_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
- res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
}
-void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling7_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr const int pool_size = 7;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr const int pool_size = 7;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- std::array<const uint8_t *, pool_size> src_ptrs{ {} };
- for(int i = 0; i < pool_size; ++i)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ std::array<const uint8_t *, pool_size> src_ptrs{{}};
+ for (int i = 0; i < pool_size; ++i)
{
- src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+ src_ptrs[i] =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
}
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
-
- auto x_val = id.x() * pool_stride_x;
- auto y_val = id.y() * pool_stride_y;
- float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
- float32x2_t res = {};
- float final_res = 0.f;
+ auto x_val = id.x() * pool_stride_x;
+ auto y_val = id.y() * pool_stride_y;
+ float32x4x2_t data =
+ read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
+ float32x2_t res = {};
+ float final_res = 0.f;
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
- for(int i = 1; i < pool_size; ++i)
- {
- in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
- x_val = id.x() * pool_stride_x;
- y_val = (id.y() * pool_stride_y) + i;
- data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
// Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
data.val[0] = vmulq_f32(data.val[0], data.val[0]);
data.val[1] = vmulq_f32(data.val[1], data.val[1]);
}
- sum_data = vaddq_f32(sum_data, data.val[0]);
- sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr,
+ fill_value);
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
}
- res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- for(int i = 1; i < pool_size; ++i)
+ else
{
- in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
- x_val = id.x() * pool_stride_x;
- y_val = (id.y() * pool_stride_y) + i;
- float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
- data = vmax2q_f32(data, temp);
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val,
+ in_ptr, fill_value);
+ data = vmax2q_f32(data, temp);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
+ res = vpmax_f32(res, res);
}
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
- res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // ENABLE_NCHW_KERNELS \ No newline at end of file
+#endif // ENABLE_NCHW_KERNELS
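For orientation, the NCHW kernels in this file all implement the same window arithmetic that their comments describe: square the inputs for L2 pooling, sum or max across the window, apply the average scale, and take the square root for L2. The scalar sketch below is illustrative only; it is not part of the diff or of the library API, the function and enum names are invented, and the exclude_padding handling is simplified to counting in-bounds taps.

// Illustrative scalar model of the pooling arithmetic vectorised above.
// Hypothetical names; simplified exclude_padding handling.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

enum class PoolKind { Avg, L2, Max };

// src is a src_h x src_w plane in row-major order. Out-of-bounds taps read
// 0 for AVG/L2 and -infinity for MAX, mirroring the kernels' fill_value.
inline float pool_window_scalar(const std::vector<float> &src, int src_w, int src_h,
                                int out_x, int out_y, int pool_size, int stride,
                                int pad_left, int pad_top, bool exclude_padding, PoolKind kind)
{
    const float fill = (kind == PoolKind::Max) ? -std::numeric_limits<float>::infinity() : 0.f;
    float       acc  = (kind == PoolKind::Max) ? fill : 0.f;
    int         taps = 0;
    for (int y = 0; y < pool_size; ++y)
    {
        for (int x = 0; x < pool_size; ++x)
        {
            const int  ix     = out_x * stride + x - pad_left;
            const int  iy     = out_y * stride + y - pad_top;
            const bool inside = (ix >= 0 && iy >= 0 && ix < src_w && iy < src_h);
            float      data   = inside ? src[iy * src_w + ix] : fill;
            if (kind == PoolKind::L2)
            {
                data *= data; // "Get power of 2 in case of l2 pooling"
            }
            if (kind == PoolKind::Max)
            {
                acc = std::max(acc, data);
            }
            else
            {
                acc += data;
                if (inside || !exclude_padding)
                {
                    ++taps; // denominator of the average scale
                }
            }
        }
    }
    if (kind != PoolKind::Max)
    {
        acc /= static_cast<float>(std::max(taps, 1)); // "Calculate scale" then "Perform pooling"
    }
    return (kind == PoolKind::L2) ? std::sqrt(acc) : acc; // "Calculate square-root in case of l2 pooling"
}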
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
index 7f8841edd8..44675b5394 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
{
namespace cpu
{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
index 8643651f27..d434323e89 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
{
namespace cpu
{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
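Both thin wrappers above instantiate the quantized NHWC template in quantized.h, whose accumulation ends with the requantization commented in the hunk that follows (quant_rescale, new_offset, and the half_scale_v round-to-nearest trick). As a reference for that arithmetic only, here is a minimal scalar sketch; the function name is invented and the kernel performs the same steps on NEON vectors via vrequantize_pooling_with_scale and vmla rather than scalar math.

// Illustrative scalar model of the requantization in the template below.
// Hypothetical name; the kernel applies these steps to q32x4_t vectors.
#include <cmath>
#include <cstdint>

// 'sum' is the sum of quantized inputs over 'count' window taps, with the
// source quantized as (src_scale, src_offset) and the destination as
// (dst_scale, dst_offset).
inline int32_t requantize_avg_scalar(int32_t sum, int count,
                                     float src_scale, int32_t src_offset,
                                     float dst_scale, int32_t dst_offset)
{
    const float avg = static_cast<float>(sum) / static_cast<float>(count);
    if (src_scale == dst_scale && src_offset == dst_offset)
    {
        // Same quantization: divide by the pool area and add 0.5f so the
        // conversion rounds to nearest instead of towards zero (half_scale_v).
        return static_cast<int32_t>(avg + 0.5f);
    }
    // Different quantization: fold both offsets into a single rescale step,
    // as done with quant_rescale and new_offset in the kernel.
    const float   rescale    = dst_scale / src_scale;
    const int32_t new_offset = dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / rescale);
    return static_cast<int32_t>(std::lround(avg / rescale)) + new_offset;
}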
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
index a2cd3991be..38f1b2f1f9 100644
--- a/src/cpu/kernels/pool2d/neon/quantized.h
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -26,11 +26,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/PoolingHelpers.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -38,7 +40,12 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_q8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
@@ -60,15 +67,15 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
using q32_t = typename wrapper::traits::promote_t<q16_t>;
using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
const int pool_pad_right = pool_info.pad_stride_info.pad_right();
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -80,233 +87,267 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
// "new_offset" doesn't have to consider the "half_scale_v" in its computation
// With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
+ if (src_qinfo != dst_qinfo)
{
- {
+ const float32x4x4_t vres = {{
vcvtq_f32_q32(vres1),
vcvtq_f32_q32(vres2),
vcvtq_f32_q32(vres3),
vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8,
+ wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
}
else
{
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
- }
- }
- else
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
- }
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
}
- }
- if(pool_info.pool_type == PoolingType::MAX)
- {
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+ if (pool_info.pool_type == PoolingType::MAX)
{
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x8_t data = wrapper::vload(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
- }
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
}
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- q32_t res = static_cast<q32_t>(0.f);
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ q32_t res = static_cast<q32_t>(0.f);
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res += data;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res += data;
+ }
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
else
{
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- else
- {
- T res = std::numeric_limits<T>::min();
+ T res = std::numeric_limits<T>::min();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
+ }
}
- }
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
}
- }
-
- },
- in, out);
+ },
+ in, out);
}
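
For reference, the averaging path above rounds by adding half_scale_v (0.5f) before the float-to-integer conversion, and the scalar left-overs loop does the same with 0.5f + res * scale. A minimal stand-alone sketch of that arithmetic, assuming a fixed 2x2 window, with average_q8 invented for the example and scale standing in for whatever calculate_avg_scale_pool2d() returns:

#include <cstdint>
#include <cstdio>

// Scalar model of the left-overs averaging path: sum the window, multiply by
// the reciprocal of the window size and add 0.5f so the truncating cast
// rounds to nearest instead of towards zero.
static uint8_t average_q8(const uint8_t *window, int count)
{
    uint32_t sum = 0;
    for (int i = 0; i < count; ++i)
    {
        sum += window[i];
    }
    const float scale = 1.0f / static_cast<float>(count);
    return static_cast<uint8_t>(0.5f + static_cast<float>(sum) * scale);
}

int main()
{
    const uint8_t window[4] = {10, 11, 11, 12};
    std::printf("avg = %u\n", static_cast<unsigned>(average_q8(window, 4))); // prints "avg = 11"
}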
#if defined(ENABLE_NCHW_KERNELS)
template <typename T, typename TVec>
-inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
- const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline void scale_vector_q16x8(bool exclude_padding,
+ TVec &v,
+ const Coordinates &id,
+ int id_offset,
+ int step,
+ const int pool_size,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
{
int start_x = (id.x() + id_offset) * stride_x - pad_x;
int start_y = id.y() * stride_y - pad_y;
const int end_y = std::min(start_y + pool_size, upper_bound_h);
- if(exclude_padding)
+ if (exclude_padding)
{
start_y = std::max(0, start_y);
}
- std::array<T, 8> elems =
- {
- {
- wrapper::vgetlane(v, 0),
- wrapper::vgetlane(v, 1),
- wrapper::vgetlane(v, 2),
- wrapper::vgetlane(v, 3),
- wrapper::vgetlane(v, 4),
- wrapper::vgetlane(v, 5),
- wrapper::vgetlane(v, 6),
- wrapper::vgetlane(v, 7),
- }
- };
-
- for(auto &el : elems)
+ std::array<T, 8> elems = {{
+ wrapper::vgetlane(v, 0),
+ wrapper::vgetlane(v, 1),
+ wrapper::vgetlane(v, 2),
+ wrapper::vgetlane(v, 3),
+ wrapper::vgetlane(v, 4),
+ wrapper::vgetlane(v, 5),
+ wrapper::vgetlane(v, 6),
+ wrapper::vgetlane(v, 7),
+ }};
+
+ for (auto &el : elems)
{
int c_start_x = start_x;
const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
- if(exclude_padding)
+ if (exclude_padding)
{
c_start_x = std::max(0, c_start_x);
}
@@ -326,15 +367,16 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates
}
template <typename T>
-auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
+auto load16_boundary_aware(
+ int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
{
ARM_COMPUTE_UNUSED(pad_b, pad_r);
T vec[16];
//handle reading a row out of the tensor
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 16; i++)
+ for (int i = 0; i < 16; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -349,24 +391,24 @@ auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t,
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
- if(deinterleave)
+ if (deinterleave)
{
- for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
{
*(ptr + i * 2) = lower[i];
}
- for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
{
*(ptr + 1 + i * 2) = upper[i];
}
}
else
{
- for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
{
*(ptr + i) = lower[i];
}
- for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
{
*(ptr + i + 8) = upper[i];
}
@@ -376,14 +418,19 @@ inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &up
template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
- for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
{
*(ptr + i) = v[i];
}
}
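
The deinterleaving store above matters for the stride-1 case, where the kernel produces one vector of even output columns and one of odd columns. A minimal scalar sketch of that interleave-and-clip behaviour, with the buffers and dst_w value invented for the example and the output x offset assumed to be zero:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint8_t lower[8] = {0, 2, 4, 6, 8, 10, 12, 14}; // even output columns
    const uint8_t upper[8] = {1, 3, 5, 7, 9, 11, 13, 15}; // odd output columns
    const int     dst_w    = 13;                          // clip anything past the row end
    uint8_t       row[16]  = {};

    // Equivalent of write16_boundary_aware<..., true>: interleave while
    // checking every destination column against dst_w.
    for (int i = 0; i < 8 && (i * 2) < dst_w; ++i)
    {
        row[i * 2] = lower[i];
    }
    for (int i = 0; i < 8 && (i * 2 + 1) < dst_w; ++i)
    {
        row[i * 2 + 1] = upper[i];
    }
    for (int i = 0; i < 16; ++i)
    {
        std::printf("%u ", static_cast<unsigned>(row[i])); // 0..12 written, 13..15 stay zero
    }
    std::printf("\n");
}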
template <typename T>
-void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -397,129 +444,136 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
- constexpr int pool_size = 2;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ constexpr int pool_size = 2;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
const bool have_different_qinfo = src_qinfo != dst_qinfo;
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int dst_w = dst0->info()->dimension(0);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int dst_w = dst0->info()->dimension(0);
const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
-
- auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- q8x8_t lower_res = {};
- q8x8_t upper_res = {};
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
+ q8x8_t lower_res = {};
+ q8x8_t upper_res = {};
- // Add rows
- const q16x8x2_t vrsum =
+ if (pool_info.pool_type != PoolingType::MAX)
{
- {
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Add rows
+ const q16x8x2_t vrsum = {{
wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
- }
- };
+ }};
- // Pair-wise add row data
- const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
- const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
+ // Pair-wise add row data
+ const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
+ const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
- q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
+ q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- lower_res = wrapper::vmovn(res_lower);
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ lower_res = wrapper::vmovn(res_lower);
- // Compute upper result for stride_x == 1
- if(pool_stride_x == 1)
- {
- // Shifted row sum
- const q16x8x2_t vrsum_shifted =
+ // Compute upper result for stride_x == 1
+ if (pool_stride_x == 1)
{
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
-
- // Pair-wise add shifted row
- q16x8_t res_upper = wrapper::vcombine(
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
-
- // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- upper_res = wrapper::vmovn(res_upper);
+ // Shifted row sum
+ const q16x8x2_t vrsum_shifted = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+
+ // Pair-wise add shifted row
+ q16x8_t res_upper = wrapper::vcombine(
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]),
+ wrapper::vgethigh(vrsum_shifted.val[1])));
+
+ // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ upper_res = wrapper::vmovn(res_upper);
+ }
}
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
- lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
- if(pool_stride_x == 1)
+ else
{
- const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
- upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+ const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
+ lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
+ if (pool_stride_x == 1)
+ {
+ const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
+ upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+ }
}
- }
- if(have_different_qinfo)
- {
- const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
- lower_res = wrapper::vgetlow(requantized_dst);
- upper_res = wrapper::vgethigh(requantized_dst);
- }
- auto out_ptr = reinterpret_cast<T *>(out.ptr());
- // Store result
- if(pool_stride_x == 1)
- {
- write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
- }
- else
- {
- write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
- }
- },
- in, out);
+ if (have_different_qinfo)
+ {
+ const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+ lower_res = wrapper::vgetlow(requantized_dst);
+ upper_res = wrapper::vgethigh(requantized_dst);
+ }
+ auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ // Store result
+ if (pool_stride_x == 1)
+ {
+ write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
+ }
+ else
+ {
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
+ }
+ },
+ in, out);
}
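
requant_scale and requant_offset above fold a dequantize-with-source-parameters step followed by a quantize-with-destination-parameters step into a single quantize call. A small sketch of that identity under invented quantization parameters; the round-half-away policy is an assumption for the example, not necessarily the exact behaviour of vrequantize_pooling:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float   src_scale  = 0.5f, dst_scale = 0.25f;
    const int32_t src_offset = 10,   dst_offset = 20;

    // Same construction as the kernel's requant_qinfo.
    const float   requant_scale  = dst_scale / src_scale; // 0.5
    const int32_t requant_offset =
        dst_offset - static_cast<int32_t>(src_offset / requant_scale);

    const int32_t q_src = 54; // some pooled value in the source quantized domain

    // Reference: dequantize with the source parameters, re-quantize with the destination ones.
    const float   real  = src_scale * (q_src - src_offset);
    const int32_t q_ref = static_cast<int32_t>(std::lround(real / dst_scale)) + dst_offset;

    // Kernel shortcut: one quantize step with the combined parameters.
    const int32_t q_new = static_cast<int32_t>(std::lround(q_src / requant_scale)) + requant_offset;

    std::printf("reference=%d shortcut=%d\n", static_cast<int>(q_ref), static_cast<int>(q_new)); // both 108
}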
template <typename T>
-void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -533,13 +587,13 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
- constexpr int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -547,147 +601,145 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_middle_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
const int src_w = src->info()->dimension(0);
const int src_h = src->info()->dimension(1);
const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
const int dst_w = dst0->info()->dimension(0);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
-
- auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
- auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
-
- q8x8_t fres = {};
- q8x16_t fqres = {};
-
- if(pool_info.pool_type == PoolingType::AVG)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Convert data to u16
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Calculate row sums
- const q16x8x2_t vrsum =
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto middle_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+ q8x8_t fres = {};
+ q8x16_t fqres = {};
+
+ if (pool_info.pool_type == PoolingType::AVG)
{
+ // Convert data to u16
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t middle_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Calculate row sums
+ const q16x8x2_t vrsum = {{
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+ }};
+ const q16x8x2_t vrsum_shifted_1 = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+ const q16x8x2_t vrsum_shifted_2 = {
+ {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}};
+ // Calculate final sum
+ q16x8x2_t final_sum = {{
+ wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
+ wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ }};
+ if (pool_stride_x == 2)
{
- wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
- wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+ q16x8_t res = {
+ wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2),
+ wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6),
+ wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2),
+ wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6),
+ };
+
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fres = wrapper::vmovn(res);
}
- };
- const q16x8x2_t vrsum_shifted_1 =
- {
+ else
{
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+                    // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
}
- };
- const q16x8x2_t vrsum_shifted_2 =
+ }
+ else
{
+ const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
+ const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
+ const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
+ const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
+
+ if (pool_stride_x == 2)
{
- wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_2(vrsum.val[1], vrsum.val[1])
+ const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}};
+ static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14};
+ fres = wrapper::vtbl(table, lookup_val);
}
- };
- // Calculate final sum
- q16x8x2_t final_sum =
- {
+ else
{
- wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
- wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ fqres = final_max;
}
- };
- if(pool_stride_x == 2)
- {
- q16x8_t res =
- {
- wrapper::vgetlane(final_sum.val[0], 0),
- wrapper::vgetlane(final_sum.val[0], 2),
- wrapper::vgetlane(final_sum.val[0], 4),
- wrapper::vgetlane(final_sum.val[0], 6),
- wrapper::vgetlane(final_sum.val[1], 0),
- wrapper::vgetlane(final_sum.val[1], 2),
- wrapper::vgetlane(final_sum.val[1], 4),
- wrapper::vgetlane(final_sum.val[1], 6),
- };
-
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fres = wrapper::vmovn(res);
}
- else
- {
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
- const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
- const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
- const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
- if(pool_stride_x == 2)
+ // Store result
+ if (pool_stride_x == 1)
{
- const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
- static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- fres = wrapper::vtbl(table, lookup_val);
+ if (src_qinfo != dst_qinfo)
+ {
+ fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres),
+ requant_qinfo);
+ }
+ write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres),
+ wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
}
else
{
- fqres = final_max;
- }
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- if(src_qinfo != dst_qinfo)
- {
- fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
- }
- write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
- }
- else
- {
- if(src_qinfo != dst_qinfo)
- {
- fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+ if (src_qinfo != dst_qinfo)
+ {
+ fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+ }
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
}
- write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
- }
- },
- in, out);
+ },
+ in, out);
}
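
In the AVG branch above, vext_1 and vext_2 add the row of column sums to copies of itself shifted by one and two columns, which yields every 3-wide horizontal window sum in a single pass. A scalar sketch of the same trick on made-up column sums:

#include <cstdint>
#include <cstdio>

int main()
{
    // top + middle + bottom row, already summed per column.
    const uint16_t col_sum[10] = {3, 6, 9, 12, 15, 18, 21, 24, 27, 30};

    uint16_t window_sum[8];
    for (int i = 0; i < 8; ++i)
    {
        // Equivalent of vadd(vadd(vrsum, vrsum_shifted_1), vrsum_shifted_2).
        window_sum[i] = col_sum[i] + col_sum[i + 1] + col_sum[i + 2];
    }
    for (int i = 0; i < 8; ++i)
    {
        std::printf("%u ", static_cast<unsigned>(window_sum[i])); // 18 27 36 45 54 63 72 81
    }
    std::printf("\n");
}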
template <typename T>
-void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -697,74 +749,81 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *
using q16_t = typename wrapper::traits::promote_t<T>;
using q32_t = typename wrapper::traits::promote_t<q16_t>;
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
- const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
- const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- T res = std::numeric_limits<T>::min();
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
+ const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
+ const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- q32_t sres = 0;
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ T res = std::numeric_limits<T>::min();
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ q32_t sres = 0;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
- sres += data;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ sres += data;
+ }
}
+ // Divide by scale
+ res = static_cast<T>(support::cpp11::round(sres * scale));
}
- // Divide by scale
- res = static_cast<T>(support::cpp11::round(sres * scale));
- }
- else
- {
- for(int y = 0; y < pool_size_y; ++y)
+ else
{
- for(int x = 0; x < pool_size_x; ++x)
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
- res = std::max(res, data);
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Store result
- res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
- *(reinterpret_cast<T *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(
+ Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo)
+ : res;
+ *(reinterpret_cast<T *>(out.ptr())) = res;
+ },
+ in, out);
}
#endif /* defined(ENABLE_NCHW_KERNELS) */
} // namespace cpu
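
Both NCHW MxN loops above substitute fill_value for taps that fall in the padded border: 0 for AVG so the sum is unchanged, the type minimum for MAX so the comparison is unaffected. A minimal scalar sketch of that policy, with read_padded and the 2x2 image invented for the example:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

static uint8_t read_padded(const uint8_t *src, int src_w, int src_h, int x, int y, uint8_t fill_value)
{
    const bool in_bounds = (x >= 0) && (y >= 0) && (x < src_w) && (y < src_h);
    return in_bounds ? src[y * src_w + x] : fill_value;
}

int main()
{
    const uint8_t img[4]   = {1, 2, 3, 4}; // 2x2 image
    const uint8_t max_fill = std::numeric_limits<uint8_t>::min();
    uint8_t       best     = max_fill;

    // 3x3 MAX window anchored at (-1, -1): most taps fall outside the image
    // and read the neutral fill value instead.
    for (int y = -1; y <= 1; ++y)
    {
        for (int x = -1; x <= 1; ++x)
        {
            best = std::max(best, read_padded(img, 2, 2, x, y, max_fill));
        }
    }
    std::printf("max = %u\n", static_cast<unsigned>(best)); // 4
}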
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 013e25537c..ce89199b5d 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -25,9 +25,10 @@
#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
#include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool3d/neon/quantized.h"
namespace arm_compute
@@ -37,8 +38,13 @@ namespace cpu
namespace
{
template <typename T>
-void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_start_x, const int window_end_x, const int window_step_x)
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
}
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
}
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
- res = -std::numeric_limits<float>::infinity();
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+ res = -std::numeric_limits<float>::infinity();
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res = std::max(res, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res = std::max(res, data);
+ }
}
}
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
}
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ },
+ out);
}
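
The pool_start/pool_end computations above clip each 3D window so that no padded tap contributes to the result. A scalar sketch of that clamping for a single axis, using invented sizes:

#include <algorithm>
#include <cstdio>

int main()
{
    const int input_dim = 7, pool_size = 3, stride = 2, pad_before = 1;

    for (int out = 0; out < 4; ++out)
    {
        const int in_idx = out * stride - pad_before;                  // theoretical window start
        const int start  = std::max(0, -in_idx);                       // skip taps before the input
        const int end_t  = std::min(input_dim + pad_before - in_idx, pool_size);
        const int end    = std::min(end_t, input_dim - in_idx);        // also exclude trailing padding
        std::printf("out=%d taps=[%d,%d)\n", out, start, end);         // last output keeps only 2 taps
    }
}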
template <typename T>
-void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
- const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
using vector_type = typename vtype::type;
@@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
- const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- // Perform pooling
- vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vadd(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vadd(vres, data);
+ }
}
}
- }
- // Divide by scale
- vres = wrapper::vmul(vres, scale_v);
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
}
}
- }
- // Divide by scale
- res *= scale;
+ // Divide by scale
+ res *= scale;
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
}
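
The reflowed average-pooling loop above sums the clamped 3D window and multiplies by a precomputed scale. As a point of reference, the same arithmetic for a single NDHWC output element can be written as a plain scalar routine. This is a hedged sketch, not the library's code: the window clamping mirrors the pool_start_*/pool_end_* computations in the hunk, the exclude-padding divisor is assumed to be the number of valid (non-padding) samples, which is what calculate_avg_scale_pool3d appears to provide, and the function name avg_pool3d_ref is hypothetical.

#include <algorithm>
#include <vector>

// Hypothetical scalar reference for one output element of 3D average pooling.
// src is a dense W x H x D volume for a single (batch, channel); sketch only.
float avg_pool3d_ref(const std::vector<float> &src, int W, int H, int D,
                     int out_x, int out_y, int out_z,
                     int pool_x, int pool_y, int pool_z,
                     int stride_x, int stride_y, int stride_z,
                     int pad_left, int pad_top, int pad_front,
                     bool exclude_padding)
{
    // Top-left-front corner of the window in input coordinates (may land in the padding).
    const int in_x = out_x * stride_x - pad_left;
    const int in_y = out_y * stride_y - pad_top;
    const int in_z = out_z * stride_z - pad_front;

    // Clamp the window to the valid input region, as pool_start_*/pool_end_* do above.
    const int x0 = std::max(0, -in_x), x1 = std::min(pool_x, W - in_x);
    const int y0 = std::max(0, -in_y), y1 = std::min(pool_y, H - in_y);
    const int z0 = std::max(0, -in_z), z1 = std::min(pool_z, D - in_z);

    float sum = 0.f;
    for (int z = z0; z < z1; ++z)
        for (int y = y0; y < y1; ++y)
            for (int x = x0; x < x1; ++x)
                sum += src[((z + in_z) * H + (y + in_y)) * W + (x + in_x)];

    // exclude_padding divides by the number of valid samples, otherwise by the full window
    // size (assumption: edge handling in the real helper may differ in detail).
    const int   valid   = (x1 - x0) * (y1 - y0) * (z1 - z0);
    const float divisor = exclude_padding ? float(std::max(valid, 1)) : float(pool_x * pool_y * pool_z);
    return sum * (1.f / divisor);
}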
template <typename T>
-void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
- const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
using vector_type = typename vtype::type;
@@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- // Perform pooling
- vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vmla(vres, data, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmla(vres, data, data);
+ }
}
}
- }
-
- const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- // Divide by scale
- vres = wrapper::vmul(vres, scale_v);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- // Calculate square-root
- vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
+ // Calculate square-root
+ vres = wrapper::vinv(wrapper::vinvsqrt(vres));
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data * data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data * data;
+ }
}
}
- }
- // Divide by scale
- res *= scale;
+ // Divide by scale
+ res *= scale;
- // Square root
- res = std::sqrt(res);
+ // Square root
+ res = std::sqrt(res);
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
}
} // namespace
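
The L2 path above finishes with vres = wrapper::vinv(wrapper::vinvsqrt(vres)), i.e. it forms sqrt(x) as 1 / (1 / sqrt(x)) out of the two reciprocal estimate instructions NEON provides. A minimal sketch of that idiom with raw intrinsics follows, assuming one Newton-Raphson refinement step per estimate (the wrappers may use more) and strictly positive input; it illustrates the technique rather than reproducing the wrapper implementation.

#include <arm_neon.h>

// Approximate sqrt(x) for x > 0 as 1 / rsqrt(x) using NEON estimate instructions
// plus one Newton-Raphson refinement each. Sketch only; refinement count is an assumption.
static inline float32x4_t sqrt_via_reciprocals(float32x4_t x)
{
    // r ~= 1/sqrt(x): estimate, then refine with vrsqrts (computes (3 - a*b) / 2).
    float32x4_t r = vrsqrteq_f32(x);
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r));

    // sqrt(x) ~= 1/r: reciprocal estimate, then refine with vrecps (computes 2 - a*b).
    float32x4_t s = vrecpeq_f32(r);
    s = vmulq_f32(s, vrecpsq_f32(r, s));
    return s;
}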
@@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
// Needed to handle loop left-over
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- switch(pool_info.pool_type)
+ switch (pool_info.pool_type)
{
case PoolingType::MAX:
- max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
case PoolingType::AVG:
- avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
case PoolingType::L2:
- l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
default:
ARM_COMPUTE_ERROR("Pool operation not supported");
@@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
// Needed to handle loop left-over
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- switch(pool_info.pool_type)
+ switch (pool_info.pool_type)
{
case PoolingType::MAX:
max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
index ac14f5eafa..8819907901 100644
--- a/src/cpu/kernels/pool3d/neon/quantized.h
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -26,17 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_step_x)
+void avg_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
{
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
// "new_offset" doesn't have to consider the "half_scale_v" in its computation
// With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
- int x_off = window_start_x;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ int x_off = window_start_x;
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
}
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
+ if (src_qinfo != dst_qinfo)
{
- {
+ const float32x4x4_t vres = {{
vcvtq_f32_q32(vres1),
vcvtq_f32_q32(vres2),
vcvtq_f32_q32(vres3),
vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
- }
- else
- {
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
}
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- q32_t res = static_cast<q32_t>(0.f);
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q32_t res = static_cast<q32_t>(0.f);
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
}
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
- }
- else
- {
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
- }
- },
- out);
+ },
+ out);
}
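
The quantised average-pool hunk above precomputes quant_rescale = dst_scale / src_scale and new_offset = dst_offset - src_offset / quant_rescale so that the accumulated uint8 sum can be mapped into the destination quantisation domain in one step. The scalar sketch below spells out that mapping; rounding and saturation details are assumptions (the vectorised vrequantize_pooling_with_scale may differ slightly) and the function name is local to the sketch.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Single-step requantisation of an accumulated sum of uint8 codes after average pooling.
// Derivation: real avg = src_scale * (sum * pool_scale - src_offset), so
//   q_dst = real avg / dst_scale + dst_offset
//         = (sum * pool_scale) / quant_rescale + new_offset.
uint8_t requantize_avg(int32_t sum, float pool_scale,            // pool_scale = 1 / window size
                       float src_scale, int32_t src_offset,
                       float dst_scale, int32_t dst_offset)
{
    const float   quant_rescale = dst_scale / src_scale;
    const int32_t new_offset    = dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / quant_rescale);

    const int32_t q = static_cast<int32_t>(std::lround((sum * pool_scale) / quant_rescale)) + new_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));   // saturate to the uint8 range
}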
template <typename T>
-void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_step_x)
+void max_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
{
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- int x_off = window_start_x;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+ int x_off = window_start_x;
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
}
}
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
- }
- // Leftovers using half the window step
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Leftovers using half the window step
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
}
}
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res = std::numeric_limits<T>::min();
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res = std::numeric_limits<T>::min();
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- res = std::max(res, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ res = std::max(res, data);
+ }
}
}
- }
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
- }
- },
- out);
+ },
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
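
The quantised max-pool kernel above walks the channel dimension in three tiers: full 16-byte vectors, then 8-byte half vectors ("Leftovers using half the window step"), then plain scalars. The sketch below shows the same tiering in isolation for a running maximum over a flat int8 buffer; it demonstrates the loop structure only, and the function name is illustrative.

#include <arm_neon.h>
#include <algorithm>
#include <cstdint>

// Running maximum of n int8 values, handled in the same three tiers as the kernel:
// 16-lane vectors, 8-lane vectors, then scalar leftovers. Sketch only.
int8_t max_three_tier(const int8_t *data, int n)
{
    int8x16_t vmax16 = vdupq_n_s8(INT8_MIN);
    int8x8_t  vmax8  = vdup_n_s8(INT8_MIN);
    int8_t    smax   = INT8_MIN;

    int x = 0;
    for (; x <= n - 16; x += 16)            // full-vector tier
        vmax16 = vmaxq_s8(vmax16, vld1q_s8(data + x));
    for (; x <= n - 8; x += 8)              // half-vector tier
        vmax8 = vmax_s8(vmax8, vld1_s8(data + x));
    for (; x < n; ++x)                      // scalar leftovers
        smax = std::max(smax, data[x]);

    // Fold the vector accumulators down to a scalar.
    vmax8 = vmax_s8(vmax8, vget_low_s8(vmax16));
    vmax8 = vmax_s8(vmax8, vget_high_s8(vmax16));
    int8_t lanes[8];
    vst1_s8(lanes, vmax8);
    for (int i = 0; i < 8; ++i)
        smax = std::max(smax, lanes[i]);
    return smax;
}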
diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp
index 5d50dce907..505c18c27d 100644
--- a/src/cpu/kernels/range/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp16.cpp
@@ -23,10 +23,10 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
namespace arm_compute
{
diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp
index 6044f0f886..e5e472abb5 100644
--- a/src/cpu/kernels/range/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp32.cpp
@@ -22,10 +22,10 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
namespace arm_compute
{
diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h
index 62144e6776..f8c30d52a0 100644
--- a/src/cpu/kernels/range/generic/neon/impl.h
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -47,35 +48,36 @@ void neon_range_function(ITensor *output, float start, float step, const Window
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / sizeof(T);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator output_it(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- for(int count = 0; count < window_step_x; ++count)
+ int x = window_start_x;
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
- }
-
- // start + step * id
- const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
- wrapper::vstore(out_ptr + x, res_vec);
- }
+ for (int count = 0; count < window_step_x; ++count)
+ {
+ id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = start + x * step;
- *(out_ptr + x) = res;
- }
+ // start + step * id
+ const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+ wrapper::vstore(out_ptr + x, res_vec);
+ }
- },
- output_it);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto res = start + x * step;
+ *(out_ptr + x) = res;
+ }
+ },
+ output_it);
}
} // namespace cpu
} // namespace arm_compute
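
The range kernel reindented above fills the output with start + i * step, building a vector of lane indices with vsetlane and fusing the multiply-add with vmla. A compact sketch of the same computation for float data follows; it is an illustration of the structure (the templated kernel also covers other element types) and the function name is hypothetical.

#include <arm_neon.h>

// Fill dst[0..n) with start + i * step, vectorised four floats at a time.
// Mirrors the shape of neon_range_function for T = float; sketch only.
void range_f32(float *dst, int n, float start, float step)
{
    const float32x4_t start_v = vdupq_n_f32(start);
    const float32x4_t step_v  = vdupq_n_f32(step);

    int x = 0;
    for (; x <= n - 4; x += 4)
    {
        // Lane indices x, x+1, x+2, x+3.
        const float       idx[4] = {float(x), float(x + 1), float(x + 2), float(x + 3)};
        const float32x4_t id_v   = vld1q_f32(idx);
        // start + step * id (fused multiply-accumulate).
        vst1q_f32(dst + x, vmlaq_f32(start_v, id_v, step_v));
    }
    for (; x < n; ++x)        // leftovers
        dst[x] = start + x * step;
}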
diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h
index 25d52bfe7f..cade91e8dd 100644
--- a/src/cpu/kernels/range/list.h
+++ b/src/cpu/kernels/range/list.h
@@ -28,8 +28,7 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_RANGE_KERNEL(func_name) \
- void func_name(ITensor *output, float start, float step, const Window &window)
+#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window)
DECLARE_RANGE_KERNEL(fp16_neon_range_function);
DECLARE_RANGE_KERNEL(fp32_neon_range_function);
diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
index c265d5d4eb..cf99830562 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
@@ -29,7 +29,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp16_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
index 51355aaef0..c1dba99b5e 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp32_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<float, float>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h
index e5e604330a..db2f67705d 100644
--- a/src/cpu/kernels/roialign/generic/neon/impl.h
+++ b/src/cpu/kernels/roialign/generic/neon/impl.h
@@ -46,7 +46,7 @@ inline input_data_type roi_align_1x1(const ITensor *input,
float region_end_y,
int pz)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return input_data_type(0);
}
@@ -55,9 +55,9 @@ inline input_data_type roi_align_1x1(const ITensor *input,
const DataLayout data_layout = input->info()->data_layout();
float avg = 0;
// Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
+ for (int iy = 0; iy < grid_size_y; ++iy)
{
- for(int ix = 0; ix < grid_size_x; ++ix)
+ for (int ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -78,20 +78,28 @@ inline input_data_type roi_align_1x1(const ITensor *input,
const float w2 = hy * lx;
const float w3 = ly * hx;
const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
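
The weights w1..w4 in the hunk above are the standard bilinear interpolation coefficients: with lx/ly the fractional distances of the sample point from the low corner and hx/hy their complements, the sampled value is w1*data1 + w2*data2 + w3*data3 + w4*data4 over the four neighbouring texels. A minimal scalar sketch, assuming a dense row-major H x W plane and clamped corner indices (the exact clamping in the kernel may differ):

#include <algorithm>
#include <cmath>

// Bilinear sample of a dense row-major H x W plane at real coordinates (x, y).
// Same weighting scheme as roi_align_1x1 above; clamping is an assumption of this sketch.
float bilinear_sample(const float *plane, int W, int H, float x, float y)
{
    x = std::min(std::max(x, 0.f), float(W - 1));
    y = std::min(std::max(y, 0.f), float(H - 1));

    const int x_low  = int(std::floor(x));
    const int y_low  = int(std::floor(y));
    const int x_high = std::min(W - 1, x_low + 1);
    const int y_high = std::min(H - 1, y_low + 1);

    const float lx = x - x_low, ly = y - y_low;   // fractional offsets
    const float hx = 1.f - lx,  hy = 1.f - ly;    // complements

    const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    return w1 * plane[y_low * W + x_low]  + w2 * plane[y_low * W + x_high] +
           w3 * plane[y_high * W + x_low] + w4 * plane[y_high * W + x_high];
}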
@@ -117,21 +125,21 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
int pz,
const QuantizationInfo &out_qinfo)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return input_data_type(out_qinfo.uniform().offset);
}
else
{
- float avg = 0;
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
- const DataLayout data_layout = input->info()->data_layout();
+ float avg = 0;
+ const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
+ const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
+ const DataLayout data_layout = input->info()->data_layout();
// Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
+ for (int iy = 0; iy < grid_size_y; ++iy)
{
- for(int ix = 0; ix < grid_size_x; ++ix)
+ for (int ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -153,41 +161,89 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
const float w3 = ly * hx;
const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
- float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ float data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ float data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
else
{
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
- const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ const auto data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ const auto data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
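
The quantised ROI Align path above maps each 8-bit code back to a real value, interpolates in real arithmetic, and only quantises again at the end. The affine mapping behind the dequantize_qasymm8/quantize_qasymm8 helpers is real = scale * (code - offset) and code = round(real / scale) + offset; a minimal sketch with a small worked example follows (names are local to the sketch).

#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine (asymmetric) 8-bit quantisation helpers, as assumed by the ROI Align path above.
float dequantize_u8(uint8_t code, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(code) - offset);
}

uint8_t quantize_u8(float real, float scale, int32_t offset)
{
    const int32_t code = static_cast<int32_t>(std::lround(real / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, code)));
}

// Example: with scale = 0.5 and offset = 10, code 14 represents 0.5 * (14 - 10) = 2.0,
// and quantising 2.0 back gives round(2.0 / 0.5) + 10 = 14.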
@@ -197,7 +253,7 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
avg /= grid_size_x * grid_size_y;
input_data_type res = 0;
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
res = quantize_qasymm8_signed(avg, out_qinfo);
}
@@ -215,7 +271,12 @@ inline float compute_region_coordinate(int p, float bin_size, float roi_anchor,
}
template <typename input_data_type, typename roi_data_type>
-void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void roi_align(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -240,7 +301,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer());
const QuantizationInfo &rois_qinfo = rois->info()->quantization_info();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
@@ -252,7 +313,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
float x2(qx2);
float y1(qy1);
float y2(qy2);
- if(is_qasymm)
+ if (is_qasymm)
{
x1 = dequantize_qasymm16(qx1, rois_qinfo);
x2 = dequantize_qasymm16(qx2, rois_qinfo);
@@ -267,44 +328,47 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
float bin_size_y = roi_dims_y / pool_info.pooled_height();
// Iterate through all feature maps
- for(int ch = 0; ch < input_chanels; ++ch)
+ for (int ch = 0; ch < input_chanels; ++ch)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
- const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
- const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
- const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
- const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
- const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
- const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
+ const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
+ const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
+ const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
+ const float region_end_y =
+ compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
+ const int roi_bin_grid_x =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+ const int roi_bin_grid_y =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
input_data_type out_val(0);
- if(is_qasymm)
+ if (is_qasymm)
{
out_val = roi_align_1x1_qasymm8<input_data_type>(
- input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
+ input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
}
else
{
- out_val = roi_align_1x1<input_data_type>(
- input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch);
+ out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x,
+ roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch);
}
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
- *out_ptr = out_val;
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
+ *out_ptr = out_val;
}
else
{
- auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
- *out_ptr = out_val;
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
+ *out_ptr = out_val;
}
}
}
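
Each pooled bin above is sampled on a roi_bin_grid_x x roi_bin_grid_y grid (the sampling ratio when given, otherwise ceil(bin_size)), every sample is placed at the centre of its sub-cell, and the results are averaged. The sketch below mirrors that placement; it reuses the bilinear_sample helper from the earlier sketch, the guard against a zero-sized grid is an assumption, and the names are hypothetical.

#include <algorithm>
#include <cmath>

float bilinear_sample(const float *plane, int W, int H, float x, float y); // earlier sketch

// Average over a bin's sampling grid, mirroring the loop structure of roi_align_1x1.
// region_start_* and bin_size_* follow the meanings used above; sketch only.
float roi_bin_average(const float *plane, int W, int H,
                      float region_start_x, float region_start_y,
                      float bin_size_x, float bin_size_y,
                      int sampling_ratio)
{
    const int grid_x = std::max(1, sampling_ratio > 0 ? sampling_ratio : int(std::ceil(bin_size_x)));
    const int grid_y = std::max(1, sampling_ratio > 0 ? sampling_ratio : int(std::ceil(bin_size_y)));

    float avg = 0.f;
    for (int iy = 0; iy < grid_y; ++iy)
    {
        for (int ix = 0; ix < grid_x; ++ix)
        {
            // Sample at the centre of each grid cell inside the bin.
            const float y = region_start_y + (iy + 0.5f) * bin_size_y / float(grid_y);
            const float x = region_start_x + (ix + 0.5f) * bin_size_x / float(grid_x);
            avg += bilinear_sample(plane, W, H, x, y);
        }
    }
    return avg / float(grid_x * grid_y);
}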
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
index d6bd9a95ce..11c5770f53 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qu8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
index a839581aff..7f93cc87b3 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qs8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h
index 1c71b02488..fdb3c0050d 100644
--- a/src/cpu/kernels/roialign/list.h
+++ b/src/cpu/kernels/roialign/list.h
@@ -27,9 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ROIALIGN_KERNEL(func_name) \
- void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \
- ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+#define DECLARE_ROIALIGN_KERNEL(func_name) \
+ void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \
+ const Window &window, const ThreadInfo &info)
DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign);
DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign);
DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign);
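
The list header change above only rewraps the declaration macro; each DECLARE_ROIALIGN_KERNEL(name) invocation still expands to an ordinary function declaration with the shared ROI Align signature, which keeps the per-type kernel list terse. For example:

// Expansion of DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign) from the macro shown above:
void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info,
                        const Window &window, const ThreadInfo &info);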
diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp
index 895f42215e..bd01569cc4 100644
--- a/src/cpu/kernels/scale/neon/fp16.cpp
+++ b/src/cpu/kernels/scale/neon/fp16.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -41,8 +42,12 @@ namespace arm_compute
{
namespace
{
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
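
fp16_neon_scale_nearest derives, for every output row, a single source row index in_hi from the height ratio hr and the sampling offset, then copies whole channel vectors from that row. A scalar sketch of the index computation follows; the ratio convention (src_h / dst_h, or (src_h - 1) / (dst_h - 1) when align_corners is set) is an assumption about what calculate_resize_ratio returns, and the function name is hypothetical.

#include <cmath>

// Source row selected for output row z under nearest-neighbour scaling. Sketch only.
int nearest_source_row(int z, int src_h, int dst_h, float sampling_offset, bool align_corners)
{
    const float hr  = (align_corners && dst_h > 1) ? float(src_h - 1) / float(dst_h - 1)
                                                   : float(src_h) / float(dst_h);
    const float pos = (z + sampling_offset) * hr;
    // align_corners rounds half away from zero (as in the kernel); otherwise it floors.
    return align_corners ? int(std::lround(pos)) : int(std::floor(pos));
}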
@@ -62,33 +67,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
Iterator out(dst, window);
const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -103,68 +121,97 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(src, win_in);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type;
const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const float16_t *in_ptr =
+ reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
else
{
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -172,4 +219,4 @@ void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, c
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
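Note: the nearest-neighbour kernels in this file (and the integer variants below) all share the same index mapping: the destination row id.z() is scaled back to a source row by the precomputed height ratio hr, rounding half away from zero when align_corners is set and flooring otherwise. A minimal standalone sketch of that mapping, with a hypothetical name map_nearest used only for illustration (the library itself uses utils::rounding::round_half_away_from_zero; std::lround is a stand-in with the same halfway behaviour):

#include <cmath>

// Sketch of the nearest-neighbour coordinate mapping used by *_neon_scale_nearest.
// 'hr' is the precomputed source/destination height ratio (see
// scale_utils::calculate_resize_ratio in the hunks above).
static int map_nearest(int dst_idx, float hr, float sampling_offset, bool align_corners)
{
    const float src_coord = (dst_idx + sampling_offset) * hr;
    // align_corners rounds half away from zero; otherwise the coordinate is floored.
    return align_corners ? static_cast<int>(std::lround(src_coord))
                         : static_cast<int>(std::floor(src_coord));
}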
diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp
index 2ab14cf83a..bbf92e0412 100644
--- a/src/cpu/kernels/scale/neon/integer.cpp
+++ b/src/cpu/kernels/scale/neon/integer.cpp
@@ -22,8 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -33,8 +34,12 @@ namespace arm_compute
{
namespace
{
-void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void u8_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -54,43 +59,58 @@ void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int input_width = src->info()->dimension(1);
const int input_height = src->info()->dimension(2);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
+ const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_stride_wc =
+ in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
// Don't increment in Y and Z direction for the input tensor
// A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -100,24 +120,37 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
Iterator in(src, win_in);
const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
+ const uint8_t *in_ptr =
+ reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height)
+ ? *in_ptr
+ : const_border_value;
+ const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<uint8_t *>(out.ptr()) =
+ static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -152,12 +185,12 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -174,7 +207,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -205,7 +238,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -270,19 +303,21 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const auto out_2_int = wrapper::vcvta<uint32_t>(out_2);
const auto out_3_int = wrapper::vcvta<uint32_t>(out_3);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
- const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
- const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
- const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
+ const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
+ const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
+ const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
+ const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -311,18 +346,27 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
}
}
-void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value);
- if(border_mode == BorderMode::REPLICATE)
+ if (border_mode == BorderMode::REPLICATE)
{
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_x = src->info()->strides_in_bytes()[1];
const int in_stride_y = src->info()->strides_in_bytes()[2];
@@ -356,12 +400,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -378,7 +422,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -409,7 +453,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -479,14 +523,16 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const auto out_2_int = wrapper::vcvt<int32_t>(out_2);
const auto out_3_int = wrapper::vcvt<int32_t>(out_3);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -515,8 +561,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
}
}
-void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void s16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -536,33 +586,46 @@ void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
Iterator out(dst, window);
const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -577,64 +640,93 @@ void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *of
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(src, win_in);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const int16_t *in_ptr =
+ reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 =
+ *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
else
{
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
@@ -642,32 +734,50 @@ void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con
}
}
-void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
}
-void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
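Note: every bilinear kernel in this patch gathers the four neighbouring samples a00/a01/a10/a11 plus the fractional offsets dx and dy and hands them to scale_helpers::delta_bilinear (declared in src/core/helpers/ScaleHelpers.h, now included earlier in these files). The body below is a sketch of the conventional bilinear weighting those argument names imply, shown only so the a00..a11 ordering in the hunks above is readable; it is not the library's definition.

// Conventional bilinear blend of four neighbours; dx/dy are the fractional
// offsets of the sampling point inside the 2x2 cell (0 <= dx, dy < 1).
static float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left
    const float w01 = dx * (1.f - dy);         // top-right
    const float w10 = (1.f - dx) * dy;         // bottom-left
    const float w11 = dx * dy;                 // bottom-right
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}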
diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h
index 28a1087224..0fe87d15a6 100644
--- a/src/cpu/kernels/scale/neon/list.h
+++ b/src/cpu/kernels/scale/neon/list.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -34,10 +35,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
DECLARE_SCALE_KERNEL(s16_neon_scale);
DECLARE_SCALE_KERNEL(u8_neon_scale);
@@ -48,14 +49,20 @@ DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
#undef DECLARE_SCALE_KERNEL
template <typename T>
-void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
- bool align_corners, const Window &window)
+void nearest_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(offsets);
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_y = src->info()->strides_in_bytes()[1];
const int in_stride_z = src->info()->strides_in_bytes()[2];
@@ -84,17 +91,17 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
const int bo_end = window_execution[3].end();
const int bo_step = window_execution[3].step();
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
float yi_f = ((yo + sampling_offset) * scale_y);
int yi = 0;
- if(align_corners)
+ if (align_corners)
{
yi = utils::rounding::round_half_away_from_zero(yi_f);
}
@@ -103,12 +110,12 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
yi = static_cast<int>(std::floor(yi_f));
}
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
float xi_f = ((xo + sampling_offset) * scale_x);
int xi = 0;
- if(align_corners)
+ if (align_corners)
{
xi = utils::rounding::round_half_away_from_zero(xi_f);
}
@@ -121,15 +128,15 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
- auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
*(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
}
}
@@ -138,9 +145,16 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
}
template <typename T>
-void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void bilinear_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(offsets);
ARM_COMPUTE_UNUSED(dx);
@@ -148,8 +162,10 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_y = src->info()->strides_in_bytes()[1];
const int in_stride_z = src->info()->strides_in_bytes()[2];
@@ -180,7 +196,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int bo_end = window_execution[3].end();
const int bo_step = window_execution[3].step();
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -189,12 +205,12 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -204,7 +220,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const auto a1 = (yi_f - static_cast<float>(yi));
const auto b1 = (1.f - a1);
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -223,32 +239,35 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
- if((yi >= 0) && (yi < in_dim_h))
+ if ((yi >= 0) && (yi < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
}
}
- if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
- in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
}
}
@@ -264,32 +283,33 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
auto in00 = static_cast<T>(const_border_value);
auto in01 = static_cast<T>(const_border_value);
auto in10 = static_cast<T>(const_border_value);
auto in11 = static_cast<T>(const_border_value);
- if((yi >= 0) && (yi < in_dim_h))
+ if ((yi >= 0) && (yi < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
}
}
- if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ in11 = *(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
}
}
auto out0 = static_cast<T>(0);
@@ -303,14 +323,14 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
}
}
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -327,7 +347,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int yi1_offset = yi1 * in_stride_z;
const int y_offset = yo * out_stride_z;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -356,12 +376,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int offset = xo * out_stride_y + y_offset;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
- const auto in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
- const auto in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
- const auto in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
- const auto in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+ const auto in00 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const auto in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const auto in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const auto in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
auto out0 = wrapper::vmul(in00, s00);
out0 = wrapper::vmla(out0, in01, s01);
@@ -370,12 +394,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
- const T in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
- const T in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
- const T in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
- const T in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+ const T in00 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const T in01 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const T in10 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const T in11 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
T out0 = in00 * s00_s;
out0 += in01 * s01_s;
@@ -394,15 +422,24 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
}
template <typename T>
-void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void common_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp
index 778459ae39..62a821daa5 100644
--- a/src/cpu/kernels/scale/neon/qasymm8.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace
{
-void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Data layout is NHWC
const int32_t input_width = src->info()->dimension(1);
@@ -40,10 +47,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -59,7 +68,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
win_in.set(1, Window::Dimension(0, 0, 0));
win_in.set(2, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -68,36 +77,41 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
Iterator out(dst, window);
const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -141,12 +155,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -163,7 +177,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -194,7 +208,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -204,34 +218,82 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
- const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in);
- const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in);
- const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in);
- const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in);
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)),
+ vscale_in);
+ const auto in00_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
- const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in);
- const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in);
- const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in);
- const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in);
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)),
+ vscale_in);
+ const auto in01_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
- const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in);
- const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in);
- const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in);
- const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in);
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)),
+ vscale_in);
+ const auto in10_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
- const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in);
- const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in);
- const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in);
- const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in);
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)),
+ vscale_in);
+ const auto in11_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)),
+ vscale_in);
auto out_0 = wrapper::vmul(in00_0, s00);
out_0 = wrapper::vmla(out_0, in01_0, s01);
@@ -264,14 +326,16 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -292,7 +356,8 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
#if defined(__aarch64__) && !defined(BARE_METAL)
*(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) =
+ quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
}
}
@@ -304,28 +369,38 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- if(src->info()->quantization_info() == dst->info()->quantization_info())
+ if (src->info()->quantization_info() == dst->info()->quantization_info())
{
- u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
- qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
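
The CONSTANT-border path above works one output pixel at a time: it dequantizes the four neighbouring QASYMM8 values, blends them with scale_helpers::delta_bilinear, and requantizes the result. A minimal standalone sketch of that arithmetic, in plain C++ with no Arm Compute Library types, assuming the standard bilinear weighting and with the rounding/clamping details of the quantize helper simplified (all parameter values here are hypothetical):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Asymmetric 8-bit quantization parameters (illustrative only).
    struct QuantInfo { float scale; int32_t offset; };

    static float dequantize(uint8_t q, QuantInfo qi)
    {
        return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
    }

    static uint8_t quantize(float v, QuantInfo qi)
    {
        // Rounding simplified to round-to-nearest; the kernel's helper chooses the policy.
        const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

    // Standard bilinear blend of the four neighbours with fractional offsets dx, dy.
    static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy);
        const float w01 = dx * (1.f - dy);
        const float w10 = (1.f - dx) * dy;
        const float w11 = dx * dy;
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }

    // One output pixel: dequantize neighbours, interpolate, requantize.
    uint8_t scale_bilinear_qasymm8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                   float dx, float dy, QuantInfo iq, QuantInfo oq)
    {
        const float out = delta_bilinear(dequantize(a00, iq), dequantize(a01, iq),
                                         dequantize(a10, iq), dequantize(a11, iq), dx, dy);
        return quantize(out, oq);
    }
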
diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
index cd63dfba63..5a885178a7 100644
--- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace
{
-void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Data layout is NHWC
const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
@@ -40,10 +47,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int32_t input_height = src->info()->dimension(2);
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -58,7 +67,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
win_in.set(1, Window::Dimension(0, 0, 0));
win_in.set(2, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -67,36 +76,41 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
Iterator out(dst, window);
const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -140,12 +154,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -162,7 +176,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -193,7 +207,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -203,34 +217,70 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
- const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in);
- const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in);
- const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in);
- const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in);
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)),
+ vscale_in);
+ const auto in00_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)),
+ vscale_in);
const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
- const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in);
- const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in);
- const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in);
- const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in);
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)),
+ vscale_in);
+ const auto in01_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)),
+ vscale_in);
const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
- const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in);
- const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in);
- const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in);
- const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in);
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)),
+ vscale_in);
+ const auto in10_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)),
+ vscale_in);
const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
- const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in);
- const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in);
- const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in);
- const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in);
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)),
+ vscale_in);
+ const auto in11_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)),
+ vscale_in);
auto out_0 = wrapper::vmul(in00_0, s00);
out_0 = wrapper::vmla(out_0, in01_0, s01);
@@ -263,14 +313,16 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -291,7 +343,8 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
#if defined(__aarch64__) && !defined(BARE_METAL)
*(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) =
+ quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
}
}
@@ -303,28 +356,39 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE)
+ if (src->info()->quantization_info() == dst->info()->quantization_info() &&
+ border_mode == BorderMode::REPLICATE)
{
- s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
- qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value,
+ sampling_offset, align_corners, window);
}
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
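
The REPLICATE-border vector path in both qasymm8 kernels repeatedly expands a 16-lane quantized register into four float32x4_t blocks: widen to 16-bit, widen to 32-bit, subtract the zero point, convert to float, multiply by the scale. Assuming the wrapper:: calls resolve to the usual Arm NEON intrinsics, one such block for the signed case can be written directly as follows (src, zero_point and scale are placeholder inputs):

    #include <arm_neon.h>
    #include <cstdint>

    // Dequantize the lowest 4 lanes of a 16-lane QASYMM8_SIGNED register.
    float32x4_t dequantize_low4(const int8_t *src, int32_t zero_point, float scale)
    {
        const int8x16_t q       = vld1q_s8(src);                  // 16 x int8
        const int16x8_t low16   = vmovl_s8(vget_low_s8(q));       // widen to int16
        const int32x4_t low32   = vmovl_s16(vget_low_s16(low16)); // widen to int32
        const int32x4_t shifted = vsubq_s32(low32, vdupq_n_s32(zero_point));
        return vmulq_n_f32(vcvtq_f32_s32(shifted), scale);        // (q - offset) * scale
    }

The unsigned variant above differs only in the extra vreinterpret from unsigned to signed 32-bit lanes before the zero-point subtraction.
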
diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp
index ceda19f366..cb28f4cb1c 100644
--- a/src/cpu/kernels/scale/sve/fp16.cpp
+++ b/src/cpu/kernels/scale/sve/fp16.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -41,8 +42,12 @@ namespace arm_compute
{
namespace
{
-void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -61,38 +66,50 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -103,4 +120,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
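
The SVE nearest-neighbour kernels all share the predicated loop reflowed above: build a while-less-than predicate, load and store under it, advance by the vector length, and stop when the predicate runs empty. A free-standing float32 version of the same pattern, using the same intrinsics as the fp32 kernel below (buffer names are placeholders):

    #include <arm_sve.h>
    #include <cstdint>

    // Copy 'count' floats from 'in' to 'out' with an SVE predicated loop.
    void predicated_copy_f32(const float *in, float *out, int32_t count)
    {
        int32_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, count);    // lanes [x, count) are active
        do
        {
            svst1_f32(pg, out + x, svld1_f32(pg, in + x));
            x += svcntw();                        // advance by the number of 32-bit lanes
            pg = svwhilelt_b32(x, count);
        } while (svptest_any(svptrue_b32(), pg)); // loop while any lane is still active
    }
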
diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp
index f3472f1efd..cbb345edbb 100644
--- a/src/cpu/kernels/scale/sve/fp32.cpp
+++ b/src/cpu/kernels/scale/sve/fp32.cpp
@@ -25,23 +25,27 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
+#include <arm_sve.h>
#include <cmath>
#include <cstddef>
-#include <arm_sve.h>
-
namespace arm_compute
{
namespace
{
-void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp32_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp32_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp
index 82c70ee360..df950b1789 100644
--- a/src/cpu/kernels/scale/sve/integer.cpp
+++ b/src/cpu/kernels/scale/sve/integer.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -39,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void u8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -59,32 +64,40 @@ void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offse
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
-void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void s16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -103,38 +116,50 @@ void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -144,12 +169,20 @@ void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, cons
}
}
-void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
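
Every nearest-neighbour kernel in these files derives the source row from the destination row through the resize ratio and the sampling offset; with align_corners the coordinate is rounded half away from zero, otherwise it is floored. A scalar sketch of that index computation (the ratio formula follows the usual align-corners convention and is an assumption about scale_utils::calculate_resize_ratio):

    #include <cmath>

    // Ratio between source and destination extents (assumed align-corners convention).
    float resize_ratio(int in_dim, int out_dim, bool align_corners)
    {
        return align_corners ? float(in_dim - 1) / float(out_dim - 1)
                             : float(in_dim) / float(out_dim);
    }

    // Source index for destination index 'out', mirroring the in_hi computation above.
    int nearest_source_index(int out, float ratio, float sampling_offset, bool align_corners)
    {
        const float coord = (out + sampling_offset) * ratio;
        return align_corners ? int(std::lround(coord)) // round half away from zero
                             : int(std::floor(coord));
    }
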
diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h
index b9c3a10a78..aff741a4a7 100644
--- a/src/cpu/kernels/scale/sve/list.h
+++ b/src/cpu/kernels/scale/sve/list.h
@@ -28,10 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
DECLARE_SCALE_KERNEL(fp16_sve_scale);
DECLARE_SCALE_KERNEL(fp32_sve_scale);
diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp
index d45a69e43b..0fc794c6c2 100644
--- a/src/cpu/kernels/scale/sve/qasymm8.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -40,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
index 67bca65f58..68ea01e29e 100644
--- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -40,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const IT
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp
index b460213c72..38a58099bd 100644
--- a/src/cpu/kernels/select/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp16.cpp
@@ -23,20 +23,22 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<float16_t>(c, x, y, output, window);
}
@@ -45,4 +47,4 @@ void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITe
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp
index 63fd594901..50a80cb338 100644
--- a/src/cpu/kernels/select/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp32.cpp
@@ -22,20 +22,22 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<float, uint32x4_t>(c, x, y, output, window);
}
-void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<float>(c, x, y, output, window);
}
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
index 6a6d9969f8..7ce640b6ff 100644
--- a/src/cpu/kernels/select/generic/neon/impl.h
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/cpu/kernels/select/generic/neon/impl.h"
@@ -37,8 +38,16 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+void select_op(const ITensor *cond,
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ const int window_step_x,
+ const int window_start_x,
+ const int window_end_x,
+ const int limit,
+ VectorType (*condition_conversion)(const uint8_t *))
{
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -48,30 +57,32 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen
Iterator input2(in2, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-
- int x = window_start_x;
- for(; x <= limit; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto c = (*condition_conversion)(condition_ptr + x);
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
- }
- for(; x < window_end_x; ++x)
- {
- const auto c = *(condition_ptr + x);
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = static_cast<bool>(c) ? a : b;
- }
- },
- condition, input1, input2, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+ for (; x <= limit; x += window_step_x)
+ {
+ const auto c = (*condition_conversion)(condition_ptr + x);
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ const auto c = *(condition_ptr + x);
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+ }
+ },
+ condition, input1, input2, output);
}
template <typename ScalarType, typename VectorType>
@@ -81,11 +92,14 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+ });
}
template <typename ScalarType, typename VectorType>
@@ -95,11 +109,14 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ });
}
template <typename ScalarType, typename VectorType>
@@ -109,15 +126,19 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ });
}
template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void select_op_not_same_rank(
+ const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
ARM_COMPUTE_UNUSED(window);
@@ -131,20 +152,20 @@ void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITen
int offset = 0;
const int step = 16 / in1->info()->element_size();
- for(int i = 0; i < outer_size; ++i)
+ for (int i = 0; i < outer_size; ++i)
{
int x = offset;
const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
- for(; x <= offset + inner_size - step; x += step)
+ for (; x <= offset + inner_size - step; x += step)
{
wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
}
- if(x <= offset + inner_size - (step / 2))
+ if (x <= offset + inner_size - (step / 2))
{
wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
x += step / 2;
}
- for(; x < offset + inner_size; ++x)
+ for (; x < offset + inner_size; ++x)
{
*(output_ptr + x) = *(input_ptr + x);
}
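select_op above takes a condition_conversion callback that widens the uint8 condition into a per-lane all-ones/all-zeros mask and then blends the two inputs with wrapper::vbsl, finishing the leftover lanes with a scalar loop. Stripped of the ACL wrappers, the idiom is vcgtq against zero followed by vbslq; the sketch below shows it for uint8 lanes under a hypothetical helper name and is not the library implementation.

// Sketch of the NEON bit-select idiom behind select_op (uint8 lanes).
// Hypothetical helper, not the ACL implementation; the tail is scalar.
#include <arm_neon.h>

#include <cstddef>
#include <cstdint>

void neon_select_u8(const uint8_t *cond, const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n)
{
    size_t           x    = 0;
    const uint8x16_t zero = vdupq_n_u8(0);
    for (; x + 16 <= n; x += 16)
    {
        const uint8x16_t mask = vcgtq_u8(vld1q_u8(cond + x), zero); // 0xFF where cond != 0
        vst1q_u8(out + x, vbslq_u8(mask, vld1q_u8(a + x), vld1q_u8(b + x)));
    }
    for (; x < n; ++x)
    {
        out[x] = cond[x] ? a[x] : b[x];
    }
}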
diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp
index 71b2f0b933..135087c261 100644
--- a/src/cpu/kernels/select/generic/neon/integer.cpp
+++ b/src/cpu/kernels/select/generic/neon/integer.cpp
@@ -25,59 +25,71 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include <arm_neon.h>
-
#include "src/cpu/kernels/select/generic/neon/impl.h"
+#include <arm_neon.h>
+
namespace arm_compute
{
namespace cpu
{
-void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window);
}
-void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window);
}
-void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int8_t>(c, x, y, output, window);
}
-void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int16_t>(c, x, y, output, window);
}
-void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int32_t>(c, x, y, output, window);
}
-void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window);
}
-void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window);
}
-void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint8_t>(c, x, y, output, window);
}
-void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint16_t>(c, x, y, output, window);
}
-void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint32_t>(c, x, y, output, window);
}
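All of the *_not_same_rank variants above come down to a per-row choice: a single condition value per outer index decides which of the two inputs supplies the whole inner row, and the NEON code in impl.h merely vectorises the copy. A scalar sketch of that logic, with a hypothetical select_rows helper standing in for select_op_not_same_rank:

// Scalar sketch of select_op_not_same_rank: one condition value per outer
// index picks which input the whole inner row is copied from.
// Hypothetical helper, not ACL code.
#include <cstdint>
#include <cstring>

template <typename T>
void select_rows(const uint8_t *cond, const T *in1, const T *in2, T *out, int outer_size, int inner_size)
{
    for (int i = 0; i < outer_size; ++i)
    {
        const T  *src    = cond[i] != 0 ? in1 : in2;
        const int offset = i * inner_size;
        std::memcpy(out + offset, src + offset, inner_size * sizeof(T));
    }
}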
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index f6556696b0..2e2adf33e0 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
@@ -30,8 +31,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index ddd270ae70..61df40c1b5 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index f07fd2fb27..5d6e6a4f80 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
#include "support/SaturateCast.h"
namespace arm_compute
@@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
+ static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
"quantized type should be either qasymm8_t or qasymm8_signed_t.");
const int start_x = in->info()->valid_region().anchor.x();
@@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
Iterator out_it(out, window);
constexpr int vec_size = 16;
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+ float sum{};
+ float sum_inversed{};
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+
+ /* Init sum to zero */
+ float32x4x4_t vec_sum = {
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ };
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (is_log)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+
+ vst4q_f32(tmp_ptr + x, vec_elements_flt);
}
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
+ /* Reduce sum */
+ const auto sum_16_byte =
+ vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
+ sum_res = vpadd_f32(sum_res, sum_res);
+ sum = wrapper::vgetlane(sum_res, 0);
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ float element{};
+ if (is_log)
+ {
+ element = (max_val - in_ptr[x]) * scale_beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((max_val - in_ptr[x]) * scale_beta);
+ sum += element;
+ }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
+ tmp_ptr[x] = element;
+ }
+
+ if (!is_log)
{
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
+ sum_inversed = 256.f / sum;
}
else
{
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
+ sum = std::log(sum);
}
-
- tmp_ptr[x] = element;
}
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
+ /* Normalize exponentials */
{
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- const float32x4x4_t sub =
+ using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
+ float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
+ int_vec_type normalized_value{};
+ if (is_log)
{
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ const float32x4x4_t sub = {
+ vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+ };
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ }
+ else
+ {
+ float32x4x4_t mul = {
+ vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+ };
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+ mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
+ mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
+ mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
+ mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ }
+
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- float32x4x4_t mul =
+ if (is_log)
{
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+ }
+ else
{
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+ (is_qasymm8_signed ? 128.f : 0));
}
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
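Per row, neon_softmax_logits_1d_quantized above subtracts the row maximum, scales by a constant derived from beta and the input quantization scale (scale_beta in the kernel), exponentiates into a float scratch buffer, and then either multiplies by 256/sum (shifting by -128 for QASYMM8_SIGNED) or, in log-softmax mode, subtracts log(sum) before saturating back to 8 bits. The scalar reference below restates that arithmetic for the unsigned, non-log case; it is a sketch only, ref_softmax_qasymm8 is a hypothetical name, and a plain clamp-and-round stands in for utils::cast::saturate_cast.

// Scalar reference for the quantized softmax arithmetic above (unsigned case).
// beta_scale corresponds to the magnitude of the kernel's scale_beta constant.
// Sketch only; assumes `in` is non-empty.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<uint8_t> ref_softmax_qasymm8(const std::vector<uint8_t> &in, float beta_scale)
{
    const uint8_t max_val = *std::max_element(in.begin(), in.end());

    std::vector<float> tmp(in.size());
    float              sum = 0.f;
    for (size_t i = 0; i < in.size(); ++i)
    {
        // exp((x - max) * beta * scale): subtracting the row max keeps the exponent <= 0
        tmp[i] = std::exp((static_cast<float>(in[i]) - static_cast<float>(max_val)) * beta_scale);
        sum += tmp[i];
    }

    // The QASYMM8 softmax output uses a 1/256 scale, so normalisation folds in a factor of 256
    const float          sum_inversed = 256.f / sum;
    std::vector<uint8_t> out(in.size());
    for (size_t i = 0; i < in.size(); ++i)
    {
        const float v = std::round(tmp[i] * sum_inversed);
        out[i]        = static_cast<uint8_t>(std::min(255.f, std::max(0.f, v)));
    }
    return out;
}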
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 206d36a2e0..4d9b789297 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+ T max_val = wrapper::vgetlane(carry_max, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
+ }
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void neon_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c
constexpr int vec_size = 16 / sizeof(T);
const int sum_stages = log2(vec_size / 2);
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
+ T sum{};
+ T sum_inversed{};
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
+ /* Compute exponentials and sum */
{
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if (is_log)
+ {
+ vec_elements =
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+ wrapper::vstore(tmp_ptr + x, vec_elements);
}
- else
+
+ /* Reduce sum */
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+ for (int i = 0; i < sum_stages; ++i)
{
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
+ sum_res = wrapper::vpadd(sum_res, sum_res);
}
- tmp_ptr[x] = element;
- }
+ sum = wrapper::vgetlane(sum_res, 0);
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ T element{};
+
+ if (is_log)
+ {
+ element = (in_ptr[x] - max_val) * beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((in_ptr[x] - max_val) * beta);
+ sum += element;
+ }
+ tmp_ptr[x] = element;
+ }
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ if (!is_log)
{
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ sum_inversed = T(1) / sum;
}
else
{
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ sum = static_cast<T>(std::log(sum));
}
- wrapper::vstore(out_ptr + x, normalized_value);
}
- /* Run remaining elements */
- for(; x < input_width; ++x)
+
+ /* Normalize exponentials */
{
- if(is_log)
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- out_ptr[x] = tmp_ptr[x] - sum;
+ auto vec_in = wrapper::vloadq(tmp_ptr + x);
+ auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ if (is_log)
+ {
+ normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ }
+ else
+ {
+ normalized_value =
+ wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ if (is_log)
+ {
+ out_ptr[x] = tmp_ptr[x] - sum;
+ }
+ else
+ {
+ out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ }
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
} // namespace cpu
} // namespace arm_compute
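neon_logits_1d_max above follows the usual NEON reduction shape: accumulate a running vector maximum over the row, fold it with log2(lanes/2) pairwise vpmax steps, then finish the ragged tail with scalar compares. The stand-alone float32 sketch below shows the same shape outside the templated ACL code; neon_row_max_f32 is a hypothetical helper and assumes n >= 1.

// Float32 sketch of the vector-max + pairwise-fold + scalar-tail pattern
// used by neon_logits_1d_max. Hypothetical helper; assumes n >= 1.
#include <arm_neon.h>

#include <cstddef>
#include <limits>

float neon_row_max_f32(const float *in, size_t n)
{
    float32x4_t vec_max = vdupq_n_f32(std::numeric_limits<float>::lowest());
    size_t      x       = 0;
    for (; x + 4 <= n; x += 4)
    {
        vec_max = vmaxq_f32(vec_max, vld1q_f32(in + x));
    }
    // Pairwise fold: 4 lanes -> 2 lanes -> 1 lane
    float32x2_t carry   = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
    carry               = vpmax_f32(carry, carry);
    float       max_val = vget_lane_f32(carry, 0);

    // Leftover elements
    for (; x < n; ++x)
    {
        max_val = in[x] > max_val ? in[x] : max_val;
    }
    return max_val;
}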
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index a572891561..40713dc496 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 7d3fe6e046..2c5e284f54 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w
{
return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
index 15a523bfc9..5e94f72faf 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
@@ -23,14 +23,20 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
index 55c4aee426..d692cc2477 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
@@ -23,14 +23,20 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 2340a31cbd..24f1bb8143 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
+ // Init max value
+ auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto current_value = svld1(pg, in_ptr + x);
- vec_max = svmax_m(pg, vec_max, current_value);
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto current_value = svld1(pg, in_ptr + x);
+ vec_max = svmax_m(pg, vec_max, current_value);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
- auto max_val = svmaxv(all_true_pg, vec_max);
+ auto max_val = svmaxv(all_true_pg, vec_max);
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co
const auto all_true_pg = wrapper::svptrue<ScalarType>();
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
- ScalarType sum{ 0 };
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
- const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
- /* Init sum to zero */
- auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+ ScalarType sum{0};
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Compute exponentials and sum */
{
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
- if(!is_log)
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+ const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- vec_elements = wrapper::svexp_z(pg, vec_elements);
- vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ auto vec_elements = svld1(pg, in_ptr + x);
+ vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
+ if (!is_log)
+ {
+ vec_elements = wrapper::svexp_z(pg, vec_elements);
+ vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ }
+ svst1(pg, tmp_ptr + x, vec_elements);
+
+ if (is_log)
+ {
+ vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ }
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ sum = svaddv(all_true_pg, vec_sum);
+
+ if (is_log)
+ {
+ sum = static_cast<ScalarType>(std::log(sum));
}
- svst1(pg, tmp_ptr + x, vec_elements);
-
- if(is_log)
+ else
{
- vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ sum = ScalarType(1) / sum;
}
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- sum = svaddv(all_true_pg, vec_sum);
-
- if(is_log)
- {
- sum = static_cast<ScalarType>(std::log(sum));
- }
- else
- {
- sum = ScalarType(1) / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Normalize exponentials */
{
- auto vec_in = svld1(pg, tmp_ptr + x);
- auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
- if(is_log)
- {
- normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- else
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- svst1(pg, out_ptr + x, normalized_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ auto vec_in = svld1(pg, tmp_ptr + x);
+ auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+ if (is_log)
+ {
+ normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ else
+ {
+ normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ svst1(pg, out_ptr + x, normalized_value);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
@@ -171,9 +178,19 @@ template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, cons
template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+template void sve_softmax_logits_1d_float<float>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
+template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
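sve_logits_1d_max above is the SVE counterpart of that reduction: no scalar tail at all, just a predicate from svwhilelt, a merging svmax_m under that predicate, and a final svmaxv fold across the vector. A hedged stand-alone float32 version of the loop, under the hypothetical name sve_row_max_f32 and assuming n >= 1:

// Float32 sketch of the SVE predicated max reduction used by sve_logits_1d_max.
// Hypothetical helper, not ACL code; assumes n >= 1.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

#include <limits>

float sve_row_max_f32(const float *in, int n)
{
    svfloat32_t vec_max = svdup_n_f32(std::numeric_limits<float>::lowest());
    int         x       = 0;
    svbool_t    pg      = svwhilelt_b32(x, n);
    do
    {
        vec_max = svmax_m(pg, vec_max, svld1(pg, in + x)); // inactive lanes keep their old value
        x += static_cast<int>(svcntw());
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(svptrue_b32(), pg));
    return svmaxv(svptrue_b32(), vec_max); // horizontal max over all lanes
}
#endif // defined(__ARM_FEATURE_SVE)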
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
index 4f76ec6a26..89a30d042f 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -33,8 +33,13 @@ template <typename ScalarType>
void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
index e9044d5fc9..85e5ccfea1 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
index ab45ce598d..4be2e2eed6 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi
{
return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 8f677c62d4..98b2f5117f 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -23,7 +23,9 @@
*/
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -31,8 +33,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void sve2_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
const int inc_2 = static_cast<int>(2 * svcntw());
const int inc_3 = static_cast<int>(3 * svcntw());
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- float sum{};
+ float sum{};
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum_0 = svdup_n_f32(0.f);
- auto vec_sum_1 = svdup_n_f32(0.f);
- auto vec_sum_2 = svdup_n_f32(0.f);
- auto vec_sum_3 = svdup_n_f32(0.f);
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
+ /* Compute exponentials and sum */
{
- const auto vec_elements = svld1(pg, in_ptr + x);
- const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum_0 = svdup_n_f32(0.f);
+ auto vec_sum_1 = svdup_n_f32(0.f);
+ auto vec_sum_2 = svdup_n_f32(0.f);
+ auto vec_sum_3 = svdup_n_f32(0.f);
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
+ {
+ const auto vec_elements = svld1(pg, in_ptr + x);
+ const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+
+ auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
+ auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
+ if (is_log)
+ {
+ vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
+ vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
+ vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
+ vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ }
+ else
+ {
+ vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
+ vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
+ vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
+ vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ }
- if(is_log)
+ svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
+ svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
+ svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
+ svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1),
+ svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
+ sum = svaddv_f32(all_true_pg, vec_sum);
+
+ /* Run remaining elements */
+ x = 0;
+ if (is_log)
{
- vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
- vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
- vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
- vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ sum = std::log(sum);
}
else
{
- vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
- vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
- vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
- vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ sum = 256.f / sum;
}
-
- svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
- svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
- svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
- svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
- sum = svaddv_f32(all_true_pg, vec_sum);
-
- /* Run remaining elements */
- x = 0;
- if(is_log)
- {
- sum = std::log(sum);
- }
- else
+ /* Normalize exponentials */
{
- sum = 256.f / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
- auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
- auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
- auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
- svfloat32_t res_0{};
- svfloat32_t res_1{};
- svfloat32_t res_2{};
- svfloat32_t res_3{};
-
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
{
- res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
- }
- else
- {
- res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
+ auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
+ auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
+ auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
+
+ svfloat32_t res_0{};
+ svfloat32_t res_1{};
+ svfloat32_t res_2{};
+ svfloat32_t res_3{};
- if(is_qasymm8_signed)
+ if (is_log)
{
- const auto offset_vec = svdup_n_f32(128.f);
- res_0 = svsub_z(pg_0, res_0, offset_vec);
- res_1 = svsub_z(pg_1, res_1, offset_vec);
- res_2 = svsub_z(pg_2, res_2, offset_vec);
- res_3 = svsub_z(pg_3, res_3, offset_vec);
+ res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ }
+ else
+ {
+ res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = svdup_n_f32(128.f);
+ res_0 = svsub_z(pg_0, res_0, offset_vec);
+ res_1 = svsub_z(pg_1, res_1, offset_vec);
+ res_2 = svsub_z(pg_2, res_2, offset_vec);
+ res_3 = svsub_z(pg_3, res_3, offset_vec);
+ }
}
- }
- // Store value
- const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
- svst1(pg, out_ptr + x, out);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
+ // Store value
+ const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
+ svst1(pg, out_ptr + x, out);
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
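Note (illustrative only, not part of the patch): the SVE2 kernel above vectorizes a row-wise quantized softmax in two passes, first exponentials of the max-subtracted inputs into a float scratch buffer, then normalization and requantization. A minimal scalar sketch of that computation follows; the function and parameter names are invented for the sketch, the output is assumed to use a 1/256 quantization scale (which is what the kernel's "sum = 256.f / sum" step corresponds to), and rounding/saturation details are approximations of what convert_float_to_int does per lane. The signed variant additionally subtracts 128 before storing, and the log path subtracts log(sum) instead of scaling, as the branches above show.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar sketch: softmax over one QASYMM8 row with input scale `in_scale` and
// softmax `beta`; output assumed quantized with scale 1/256.
std::vector<uint8_t> softmax_row_qasymm8_sketch(const std::vector<uint8_t> &in, float in_scale, float beta)
{
    if (in.empty())
    {
        return {};
    }
    const uint8_t max_val = *std::max_element(in.begin(), in.end());

    // Pass 1: exp(beta * scale * (x - max)) into a float buffer (the role of
    // tmp_ptr above), accumulating the running sum.
    std::vector<float> tmp(in.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        tmp[i] = std::exp(beta * in_scale * (static_cast<float>(in[i]) - static_cast<float>(max_val)));
        sum += tmp[i];
    }

    // Pass 2: normalize and requantize to uint8.
    const float norm = 256.f / sum;
    std::vector<uint8_t> out(in.size());
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        const float q = std::nearbyint(tmp[i] * norm);
        out[i] = static_cast<uint8_t>(std::min(std::max(q, 0.f), 255.f));
    }
    return out;
}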
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
index abbcc15181..33fcc26cda 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -31,8 +31,13 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void sve2_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
index 810035eb9c..95623786b3 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
index 283b55e9ce..c20462fcef 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index ed3515f417..627ce0c264 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -28,9 +28,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SOFTMAX_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *max, void *const tmp, \
- ITensor *out, const float beta, bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
+ bool is_log, const Window &window)
DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
@@ -43,8 +43,7 @@ DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
#undef DECLARE_SOFTMAX_KERNEL
-#define DECLARE_LOGITS_KERNEL(func_name) \
- void func_name(const ITensor *in, ITensor *out, const Window &window)
+#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
DECLARE_LOGITS_KERNEL(neon_fp32_logits);
DECLARE_LOGITS_KERNEL(neon_fp16_logits);
diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h
index f7e1a040bd..9f6c92271f 100644
--- a/src/cpu/kernels/sub/neon/list.h
+++ b/src/cpu/kernels/sub/neon/list.h
@@ -26,14 +26,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SUB_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_SUB_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint);
DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint);
@@ -44,7 +46,8 @@ DECLARE_SUB_KERNEL(sub_qsymm16_neon);
#undef DECLARE_SUB_KERNEL
template <typename T>
-void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -68,7 +71,7 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
Iterator output(dst, window);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -84,41 +87,44 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+ const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
- if(is_broadcast_input_2)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
+ if (is_broadcast_input_2)
+ {
+ res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+ }
+ wrapper::vstore(output_ptr + x, res);
}
- wrapper::vstore(output_ptr + x, res);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
- if(is_broadcast_input_2)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- res = static_cast<T>(-1) * res;
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ auto res =
+ is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
+ if (is_broadcast_input_2)
+ {
+ res = static_cast<T>(-1) * res;
+ }
+
+ *(output_ptr + x) = res;
}
-
- *(output_ptr + x) = res;
- }
- },
- broadcast_input, non_broadcast_input, output);
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -131,31 +137,32 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
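Note (illustrative only, not part of the patch): the subtle part of sub_same_neon above is the broadcast-along-x path, where the kernel always computes broadcast_value - non_broadcast and then negates the result if the broadcast operand was actually src1, so the stored value is always src0 - src1. A scalar sketch of that ordering trick follows, covering the non-saturating path only; the names are assumptions for the sketch, not library API.

#include <cstddef>

// Compute (broadcast - non_broadcast) per element, then flip the sign if the
// broadcast operand was src1, so the output is always src0 - src1.
template <typename T>
void sub_with_broadcast_sketch(
    const T *non_broadcast, T broadcast_value, bool is_broadcast_input_2, T *out, std::size_t len)
{
    for (std::size_t x = 0; x < len; ++x)
    {
        T res = static_cast<T>(broadcast_value - non_broadcast[x]);
        if (is_broadcast_input_2)
        {
            // broadcast operand is src1: (src1 - src0) becomes src0 - src1
            res = static_cast<T>(static_cast<T>(-1) * res);
        }
        out[x] = res;
    }
}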
diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp
index ea6e5826dd..b750afce6e 100644
--- a/src/cpu/kernels/sub/neon/qasymm8.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8.cpp
@@ -23,21 +23,24 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
}
-void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
index a86c7f22f6..fb0bb62682 100644
--- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
@@ -24,21 +24,24 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
}
-void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp
index 4dfdc0e78c..23e4b03843 100644
--- a/src/cpu/kernels/sub/neon/qsymm16.cpp
+++ b/src/cpu/kernels/sub/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +59,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,61 +76,62 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
- const float32x4x2_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
- }
- };
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+ const float32x4x2_t bf = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
+ }};
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const float32x4x2_t af =
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const float32x4x2_t af = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
+ }};
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
#endif //__aarch64__
- }
- };
+ }};
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -140,38 +143,32 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const float32x4x2_t af =
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const float32x4x2_t af = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
+ }};
- const float32x4x2_t bf =
- {
- {
+ const float32x4x2_t bf = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
- }
- };
+ }};
- const int32x4x2_t rf =
- {
- {
+ const int32x4x2_t rf = {{
#ifdef __aarch64__
vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
@@ -179,23 +176,22 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
- }
- };
+ }};
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
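Note (illustrative only, not part of the patch): per element, the QSYMM16 path above dequantizes each operand with its own scale, subtracts in float, then requantizes with the reciprocal of the output scale and saturates to the int16 range (the job of vcvtnq_s32_f32 and vqmovn_s32 per lane). A scalar sketch under those assumptions, with invented names:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of the per-element QSYMM16 subtraction: dequantize, subtract,
// requantize with 1/output_scale, round to nearest (default FP rounding mode,
// i.e. nearest-even, matching vcvtnq on AArch64) and saturate to int16.
int16_t sub_qsymm16_sketch(int16_t a, int16_t b, float scale_a, float scale_b, float scale_out)
{
    const float af  = static_cast<float>(a) * scale_a;
    const float bf  = static_cast<float>(b) * scale_b;
    const float rf  = std::nearbyint((af - bf) * (1.f / scale_out));
    const float sat = std::min(std::max(rf, -32768.f), 32767.f);
    return static_cast<int16_t>(sat);
}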
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
index 197e9850b9..44d70cf503 100644
--- a/src/cpu/operators/CpuActivation.cpp
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuActivation.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/IOperator.h"
#include "src/common/utils/LegacySupport.h"
#include "src/common/utils/Log.h"
@@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con
_kernel = std::move(k);
}
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+Status
+CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
{
return kernels::CpuActivationKernel::validate(input, output, activation_info);
}
@@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
{
TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
auto info = detail::convert_to_activation_info(act);
- if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ if (is_validate &&
+ !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
{
return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
}
@@ -69,7 +75,7 @@ std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTenso
act_op->configure(&src_info, &dst_info, info);
auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
+ if (op == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
return std::make_tuple(nullptr, StatusCode::OutOfMemory);
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
index e21fc7d32c..ec442f92c8 100644
--- a/src/cpu/operators/CpuActivation.h
+++ b/src/cpu/operators/CpuActivation.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ACTIVATION_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 41def8e22f..53cd7fa1b7 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuAdd.h"
-#include "src/cpu/kernels/CpuAddKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAdd::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
@@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAdd::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
index db05c100cc..5f60102de2 100644
--- a/src/cpu/operators/CpuAdd.h
+++ b/src/cpu/operators/CpuAdd.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ADD_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -55,14 +56,22 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
index 590ee482ca..2f19f2f842 100644
--- a/src/cpu/operators/CpuAddMulAdd.cpp
+++ b/src/cpu/operators/CpuAddMulAdd.cpp
@@ -21,39 +21,49 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuAddMulAddKernel.h"
-#include "src/cpu/operators/CpuAddMulAdd.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAdd::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
_dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul);
_dequantize_bn_add.configure(bn_add, &_dequantized_bn_add);
- k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info);
+ k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy,
+ act_info);
// Save auxilary memory requirements after configuration
- _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size());
- _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size());
+ _aux_mem[DequantizedBnMul] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_mul.total_size());
+ _aux_mem[DequantizedBnAdd] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_add.total_size());
}
else
{
@@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input
_kernel = std::move(k);
}
-Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32);
TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32);
@@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul));
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add));
- return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add,
+ add_output, final_output, policy, act_info);
}
else
{
- return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy,
+ act_info);
}
}
@@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors)
{
const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
- CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true);
- CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true);
+ CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors,
+ true);
+ CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors,
+ true);
- ITensorPack dequantize_mul_pack =
- {
- { TensorType::ACL_SRC_0, bn_mul },
- { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() }
- };
+ ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul},
+ {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}};
- ITensorPack dequantize_add_pack =
- {
- { TensorType::ACL_SRC_0, bn_add },
- { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() }
- };
+ ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add},
+ {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}};
_dequantize_bn_mul.run(dequantize_mul_pack);
_dequantize_bn_add.run(dequantize_add_pack);
- ITensorPack add_mul_add_pack =
- {
- { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) },
- { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
- { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() },
- { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() },
- { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) },
- { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) },
+ ITensorPack add_mul_add_pack = {
+ {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)},
+ {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)},
+ {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()},
+ {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()},
+ {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)},
+ {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)},
};
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack);
diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h
index cf1ece68f1..47db75c37e 100644
--- a/src/cpu/operators/CpuAddMulAdd.h
+++ b/src/cpu/operators/CpuAddMulAdd.h
@@ -42,20 +42,28 @@ public:
* Similar to @ref NEAddMulAdd::configure()
*
*/
- void configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAddMulAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -77,7 +85,7 @@ private:
TensorInfo _dequantized_bn_mul{};
TensorInfo _dequantized_bn_add{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
index 1cfd8c1d0e..55b9204d71 100644
--- a/src/cpu/operators/CpuCast.cpp
+++ b/src/cpu/operators/CpuCast.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCast.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
index 4021fd8ded..5f517a8fcb 100644
--- a/src/cpu/operators/CpuConcatenate.cpp
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -23,21 +23,20 @@
*/
#include "src/cpu/operators/CpuConcatenate.h"
-#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
namespace arm_compute
{
@@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
unsigned int offset = 0;
- for(unsigned int i = 0; i < _num_srcs; ++i)
+ for (unsigned int i = 0; i < _num_srcs; ++i)
{
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
}
}
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+Status
+CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
unsigned int offset = 0;
- for(const auto &src : srcs_vector)
+ for (const auto &src : srcs_vector)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
offset += src->dimension(axis);
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
void CpuConcatenate::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
- if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+ if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
{
ARM_COMPUTE_ERROR("Configured with different number of inputs");
}
int i = 0;
- for(auto &k : _concat_kernels)
+ for (auto &k : _concat_kernels)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index eb11926b48..c36977c70f 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -68,8 +68,8 @@ public:
private:
std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
- unsigned int _num_srcs{ 0 };
- unsigned int _axis{ 0 };
+ unsigned int _num_srcs{0};
+ unsigned int _axis{0};
};
} // namespace cpu
} // namespace arm_compute
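Note (illustrative only, not part of the patch): the CpuConcatenate::validate logic a little above boils down to shape bookkeeping, where each source advances a running offset along the concatenation axis and the accumulated extent must fill the destination exactly. The standalone sketch below simplifies this relative to the real per-kernel validation; the Shape alias and function name are assumptions for the sketch.

#include <cstddef>
#include <vector>

using Shape = std::vector<std::size_t>; // dimension sizes, index = axis

// Check that concatenating `srcs` along `axis` produces exactly `dst`.
bool concat_shapes_valid_sketch(const std::vector<Shape> &srcs, const Shape &dst, std::size_t axis)
{
    if (srcs.size() < 2 || axis >= dst.size())
    {
        return false;
    }
    std::size_t offset = 0;
    for (const Shape &src : srcs)
    {
        if (src.size() != dst.size())
        {
            return false;
        }
        for (std::size_t d = 0; d < dst.size(); ++d)
        {
            // Dimensions other than the concatenation axis must match the output
            if (d != axis && src[d] != dst[d])
            {
                return false;
            }
        }
        offset += src[axis]; // mirrors "offset += src->dimension(axis)"
    }
    return offset == dst[axis];
}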
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 16ac16b3ba..19311733db 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDirectConv2d.h"
#include "src/cpu/operators/CpuGemm.h"
@@ -35,26 +37,35 @@ namespace arm_compute
{
namespace cpu
{
-CpuConv2d::CpuConv2d()
- : _function()
+CpuConv2d::CpuConv2d() : _function()
{
}
CpuConv2d::~CpuConv2d() = default;
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuConv2d::configure(ITensorInfo *input,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
{
@@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso
_aux_mem = _function->workspace();
}
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM_CONV2D:
ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
@@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights,
return Status{};
}
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
@@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
- const std::vector<ConfigurationMethod> known_configs =
- {
+ const std::vector<ConfigurationMethod> known_configs = {
// Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U)),
+ ConvolutionMethod::GEMM),
// VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionMethod::GEMM),
// Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM),
// Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM)};
const auto find_config = [&](ConfigurationMethod c)
{
const ConvolutionConfiguration config = c.first;
const PadStrideInfo info = std::get<3>(config);
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
{
return (*found).second;
}
- if(dilation != Size2D(1U, 1U))
+ if (dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
@@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
{
// SRGAN
// Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+ if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) &&
+ (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
{
return ConvolutionMethod::DIRECT;
}
- if(input->dimension(idx_c) < 16)
+ if (input->dimension(idx_c) < 16)
{
return ConvolutionMethod::GEMM;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// This heuristic only applies to the F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+ if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math &&
+ input->data_type() == DataType::F16)
{
// Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
+ const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = {
// Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
};
const auto find_conv_config = [&](ConvolutionConfiguration c)
{
const PadStrideInfo info = std::get<3>(c);
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
+ bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(),
+ known_bad_winograd_f16_with_fastmath_configs.end(),
+ find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end();
+ if (found_bad)
{
return ConvolutionMethod::GEMM;
}
@@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
{
return ConvolutionMethod::GEMM;
}
- if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
{
return ConvolutionMethod::WINOGRAD;
}
- if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+ if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
{
return ConvolutionMethod::GEMM_CONV2D;
}
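The hunk above is a pure reformat of the method-selection heuristic: a table of known (input size, kernel size, channels, pad/stride) configurations is consulted first, then dilation, channel count, 1x1 kernels and the Winograd/GEMM validators are tried in order. A condensed, standalone sketch of that lookup-then-fallback shape, using plain tuples instead of the library's Size2D/PadStrideInfo (illustrative only, not the library API):

#include <algorithm>
#include <tuple>
#include <utility>
#include <vector>

enum class ConvolutionMethod { GEMM, WINOGRAD, DIRECT, GEMM_CONV2D };

// Stand-in for (input WxH, kernel WxH, stride); the real code also matches padding and channel counts.
using Config = std::tuple<int, int, int, int, int>;

ConvolutionMethod pick_method(const Config &query, int dilation, int input_channels)
{
    // Table of configurations with a known-good method (cf. the Alexnet/VGG/Mobilenet entries above).
    static const std::vector<std::pair<Config, ConvolutionMethod>> known = {
        {Config{27, 27, 5, 5, 1}, ConvolutionMethod::GEMM},
        {Config{224, 224, 3, 3, 1}, ConvolutionMethod::GEMM},
    };
    const auto it = std::find_if(known.begin(), known.end(),
                                 [&](const auto &entry) { return entry.first == query; });
    if (it != known.end())
    {
        return it->second; // exact match wins
    }
    // Fallback chain mirroring the order above: dilated, shallow or 1x1 cases default to GEMM,
    // otherwise the Winograd path is attempted first.
    const bool is_1x1 = std::get<2>(query) == 1 && std::get<3>(query) == 1;
    if (dilation != 1 || input_channels < 16 || is_1x1)
    {
        return ConvolutionMethod::GEMM;
    }
    return ConvolutionMethod::WINOGRAD;
}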
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
index 0908ac0cbb..71b9e15dc1 100644
--- a/src/cpu/operators/CpuConv2d.h
+++ b/src/cpu/operators/CpuConv2d.h
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -102,17 +103,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
*
* Similar to CpuConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CpuConv2d
*
* @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -132,11 +148,17 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
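For context, the reflowed configure()/validate()/run() signatures above are driven in the usual two-phase pattern: validate and configure on tensor metadata, then execute with an ITensorPack of real tensors. A rough sketch under the assumption that the internal src/ headers are on the include path; workspace buffers reported by workspace() and the bias tensor are left out for brevity, and the shapes are illustrative only:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuConv2d.h"

using namespace arm_compute;

void example_conv2d()
{
    // Illustrative NCHW shapes: 32x32 input with 16 IFM, 3x3 kernel producing 8 OFM.
    TensorInfo src_info(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1);

    cpu::CpuConv2d conv;
    if (bool(cpu::CpuConv2d::validate(&src_info, &wei_info, nullptr, &dst_info, conv_info)))
    {
        conv.configure(&src_info, &wei_info, nullptr, &dst_info, conv_info);
    }

    // At run time the actual tensors are passed through an ITensorPack.
    Tensor src, weights, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    weights.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);
    pack.add_tensor(TensorType::ACL_SRC_1, &weights);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    // Note: a complete example would also provide the ACL_INT_x workspace tensors
    // described by conv.workspace(); they are omitted here.
    conv.prepare(pack);
    conv.run(pack);
}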
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
index 810ffb1e4e..49e31926e3 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
@@ -31,7 +32,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
@@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI
_kernel = std::move(k);
}
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
}
@@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
{
NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
index ea70eee134..e208cca3a0 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -41,14 +41,18 @@ public:
* @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ void
+ configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuConvertFullyConnectedWeights::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
};
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
index 7420ff6240..92c19d4df2 100644
--- a/src/cpu/operators/CpuCopy.cpp
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCopy.h"
-#include "src/cpu/kernels/CpuCopyKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCopyKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
index 884fe5c4ed..54075f2afa 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -24,10 +24,11 @@
#include "src/cpu/operators/CpuDepthwiseConv2d.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -37,11 +38,16 @@ namespace cpu
{
namespace
{
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments_optimized(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ if (!is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
@@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
- info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
- info.pad_stride_info.pad_bottom());
-
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) >
+ src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) >
+ src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if (biases != nullptr)
{
- const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
@@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
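The two ARM_COMPUTE_RETURN_ERROR_ON checks reformatted above bound the dilated kernel extent by the padded input size. Stripped of the library macros, the condition per axis is simply:

// Effective extent of a kernel of size k with dilation d is k + (k - 1) * (d - 1);
// the validation above rejects the configuration when this exceeds input + pad_before + pad_after.
bool dilated_kernel_fits(int kernel, int dilation, int input, int pad_before, int pad_after)
{
    const int effective_kernel = kernel + (kernel - 1) * (dilation - 1);
    return effective_kernel <= input + pad_before + pad_after;
}

// Example: a 3-tap kernel with dilation 2 has an effective extent of 5, so
// dilated_kernel_fits(3, 2, 3, 1, 1) == true, while dilated_kernel_fits(3, 2, 3, 0, 0) == false.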
@@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_has_bias = biases != nullptr;
@@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
_are_weights_const = weights->are_values_constant();
// Configure pipeline
- _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+ _is_activationlayer_enabled =
+ info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
_dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
}
// Configure activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
@@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute input
- if(_permute)
+ if (_permute)
{
ITensorPack pack;
auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run assembly function
- if(_is_nchw)
+ if (_is_nchw)
{
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Permute output
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
{
// if weights are not constant then we need to repack so that weights
// can be updated in-place
- if(!_are_weights_const)
+ if (!_are_weights_const)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
return;
}
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute weights
- if(_permute)
+ if (_permute)
{
auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
}
}
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_nchw = src->data_layout() == DataLayout::NCHW;
_is_prepared = !_is_nchw;
@@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
auto input_perm = std::make_unique<TensorInfo>();
auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ auto output_perm = std::make_unique<TensorInfo>(
+ dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
_depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
_depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_output = std::make_unique<cpu::CpuPermute>();
_permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
@@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
//Configure Activation Layer
_is_activationlayer_enabled = info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
}
}
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
TensorShape permuted_input_shape = src->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ TensorShape permuted_output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+ const TensorInfo permuted_input = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NCHW));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
}
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
@@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- if(_is_nchw)
+ if (_is_nchw)
{
prepare(tensors);
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
@@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
else
{
@@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
_permute_output->run(pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -441,12 +476,17 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors
}
}
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
- _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
- switch(_depth_conv_func)
+ _depth_conv_func =
+ get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.configure(src, weights, biases, dst, info);
@@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights,
}
}
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
- switch(depth_conv_func)
+ switch (depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
@@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w
}
}
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
- if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+ if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
{
return DepthwiseConvolutionFunction::OPTIMIZED;
}
@@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi
void CpuDepthwiseConv2d::run(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.run(tensors);
@@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.prepare(tensors);
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
index 3d8719ee44..7eaa0df857 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.h
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -24,8 +24,9 @@
#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
-#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -56,14 +57,22 @@ public:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
*
* @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -76,7 +85,10 @@ public:
*
* @return a Depthwise Convolution Function
*/
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info);
// Inherited methods overridden:
@@ -118,32 +130,40 @@ private:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- bool _are_weights_const{ true };
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _has_bias{false};
+ bool _is_quantized{false};
+ bool _is_nchw{true};
+ bool _permute{false};
+ bool _is_activationlayer_enabled{false};
+ bool _is_prepared{false};
+ bool _are_weights_const{true};
};
/** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -176,7 +196,11 @@ private:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -184,24 +208,28 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _is_nchw{true};
+ bool _is_prepared{false};
+ bool _is_activationlayer_enabled{false};
};
- DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
+ DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC};
CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
CpuDepthwiseConv2dGeneric _func_generic{};
};
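The generic path declared above wraps the native kernel with optional layout permutes; the run() reformatted earlier in CpuDepthwiseConv2d.cpp permutes NCHW inputs to NHWC, runs the kernel, then permutes the result back. A condensed sketch of that structure with placeholder callables (not the library types):

#include <functional>

struct Tensors { /* placeholder for the ITensorPack used by the real operator */ };

void run_generic_depthwise(bool is_nchw,
                           const std::function<void(Tensors &)> &permute_to_nhwc,
                           const std::function<void(Tensors &)> &run_native_kernel,
                           const std::function<void(Tensors &)> &permute_to_nchw,
                           Tensors &tensors)
{
    if (is_nchw)
    {
        permute_to_nhwc(tensors);   // CpuPermute with PermutationVector(2U, 0U, 1U)
    }
    run_native_kernel(tensors);     // CpuDepthwiseConv2dNativeKernel, scheduled over DimY
    if (is_nchw)
    {
        permute_to_nchw(tensors);   // CpuPermute with PermutationVector(1U, 2U, 0U)
    }
}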
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index d078155155..8d3741de96 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -38,15 +39,14 @@ namespace cpu
{
struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
{
- std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
- bool is_prepared{ false };
- bool are_weights_const{ true };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr};
+ bool is_prepared{false};
+ bool are_weights_const{true};
experimental::MemoryRequirements mem_req{};
};
#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
- : _pImpl(std::make_unique<LocalImpl>())
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
@@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
_pImpl->are_weights_const = weights->are_values_constant();
// If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+ if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
{
return;
}
@@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
// Compute memory requirements for assembly kernels
constexpr size_t alignment = 4096;
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment });
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment});
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment});
_pImpl->asm_kernel = std::move(dwc_wrapper);
}
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
}
@@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
{
const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
+ if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
{
// Pack weights and bias
const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
const auto weights_padding = weights->info()->padding();
const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
- const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+ const size_t ld_weights_row =
+ ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
_pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
weights->mark_as_unused();
- if(bias != nullptr)
+ if (bias != nullptr)
{
bias->mark_as_unused();
}
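configure() above only records workspace requirements as {tensor id, size, alignment} with a 4096-byte alignment; the surrounding runtime is expected to hand back buffers honouring that. A standalone illustration of producing such an aligned scratch allocation (plain C++17, not the library's MemoryGroup machinery):

#include <cstddef>
#include <cstdlib>

// Allocate `size` bytes aligned to `alignment` (a power of two), mirroring the
// {ACL_INT_x, size, 4096} requirements pushed above. The caller frees with std::free().
void *allocate_workspace(std::size_t size, std::size_t alignment = 4096)
{
    // std::aligned_alloc requires the size to be a multiple of the alignment.
    const std::size_t padded = (size + alignment - 1) / alignment * alignment;
    return std::aligned_alloc(alignment, padded);
}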
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index f222ab9cf9..f1816625d2 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -53,14 +54,22 @@ public:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Checks if activation is supported by the assembly kernels
*
* @param[in] activation Activation to check
@@ -70,8 +79,8 @@ public:
static bool is_activation_supported(const ActivationLayerInfo &activation);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
index 12dc136ba3..c05a23f3a7 100644
--- a/src/cpu/operators/CpuDequantize.cpp
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDequantizeKernel.h"
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
index 9cdbdb61c1..135a3bb2b9 100644
--- a/src/cpu/operators/CpuDirectConv2d.cpp
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,12 +37,25 @@ namespace cpu
CpuDirectConv2d::~CpuDirectConv2d() = default;
CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+ : _memory_group(std::move(memory_manager)),
+ _output_stage_kernel(),
+ _conv_kernel(),
+ _input_border_handler(),
+ _activationlayer_function(),
+ _accumulator(),
+ _has_bias(false),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ),
+ _is_padding_required()
{
}
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CpuDirectConv2d::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info);
@@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_input_border_handler = std::make_unique<NEFillBorderKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_has_bias = (bias != nullptr);
_conv_kernel->configure(src, weights, dst, conv_info);
- if(_has_bias)
+ if (_has_bias)
{
_output_stage_kernel->configure(dst, bias);
}
_is_padding_required = !_conv_kernel->border_size().empty();
- if(_is_padding_required)
+ if (_is_padding_required)
{
// Add zero padding XY
- _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue(static_cast<float>(0.f)));
}
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, act_info);
}
}
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+Status CpuDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
@@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate bias kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
}
@@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_is_padding_required)
+ if (_is_padding_required)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_DST, src);
- NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(),
+ pack);
}
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_has_bias)
+ if (_has_bias)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, dst);
@@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
index fa8d61e083..73c85f2dcd 100644
--- a/src/cpu/operators/CpuDirectConv2d.h
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -24,13 +24,14 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -75,14 +76,23 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -95,10 +105,10 @@ private:
std::unique_ptr<NEFillBorderKernel> _input_border_handler;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
+ bool _has_bias{false};
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
+ bool _is_padding_required{false};
};
} // namespace cpu
} // namespace arm_compute
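The run() reformatted in CpuDirectConv2d.cpp above executes a fixed pipeline: optional border fill, the direct convolution kernel, an optional bias/output stage, and an optional fused activation. Schematically, with each stage as a placeholder callable:

#include <functional>

// Schematic of CpuDirectConv2d::run(); each step stands in for a scheduled kernel.
void run_direct_conv2d(bool padding_required,
                       bool has_bias,
                       bool activation_enabled,
                       const std::function<void()> &fill_border,   // NEFillBorderKernel over DimZ
                       const std::function<void()> &conv_kernel,   // CpuDirectConv2dKernel over _dim_split
                       const std::function<void()> &output_stage,  // CpuDirectConv2dOutputStageKernel over DimY
                       const std::function<void()> &activation)    // fused CpuActivation on dst
{
    if (padding_required)
    {
        fill_border();
    }
    conv_kernel();
    if (has_bias)
    {
        output_stage();
    }
    if (activation_enabled)
    {
        activation();
    }
}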
diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp
index aa74e420a6..626f1c6775 100644
--- a/src/cpu/operators/CpuDirectConv3d.cpp
+++ b/src/cpu/operators/CpuDirectConv3d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,11 +37,17 @@ namespace cpu
CpuDirectConv3d::~CpuDirectConv3d() = default;
CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+ : _memory_group(std::move(memory_manager)),
+ _conv_kernel(),
+ _activationlayer_function(),
+ _accumulator(),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ)
{
}
-void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
+void CpuDirectConv3d::configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info);
ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
@@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
_conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
//Configure Activation Layer
_is_activationlayer_enabled = conv_info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, conv_info.act_info);
}
}
-Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info)
+Status CpuDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info));
- if(conv_info.act_info.enabled())
+ if (conv_info.act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info));
}
@@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
index cde01f07c2..3ad1e09a14 100644
--- a/src/cpu/operators/CpuDirectConv3d.h
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -24,14 +24,15 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
#define ARM_COMPUTE_CPU_DIRECTCONV3D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -76,14 +77,19 @@ public:
* The 1st dimension must be equal to the 1st dimension of the @p kernels tensor.
* @param[in] conv_info Contains padding, stride, activation information.
*/
- void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
+ void configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv3d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -93,8 +99,8 @@ private:
std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
index b88ae3e514..c2ae8773c6 100644
--- a/src/cpu/operators/CpuElementwise.cpp
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwise.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseKernel.h"
@@ -33,7 +34,7 @@ namespace cpu
void CpuElementwiseBase::run(ITensorPack &tensors)
{
// If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, con
}
template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
}
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
+void CpuElementwiseComparison::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ComparisonOperation op)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
auto k = std::make_unique<kernels::CpuComparisonKernel>();
@@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI
_kernel = std::move(k);
}
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ComparisonOperation op)
{
return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
}
@@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>
template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
index b6c61cf245..5db53c8026 100644
--- a/src/cpu/operators/CpuElementwise.h
+++ b/src/cpu/operators/CpuElementwise.h
@@ -139,7 +139,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
};
/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
@@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqua
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
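
To illustrate the comparison path reorganised above, a minimal sketch that validates one of the explicitly instantiated static comparisons (GreaterEqual). Shapes and data types are arbitrary placeholders, and the function name check_greater_equal is hypothetical.

// Illustrative sketch: validate a GreaterEqual comparison of two F32 tensors
// producing a U8 mask, using only the signatures declared in this header.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuElementwise.h"

arm_compute::Status check_greater_equal()
{
    using namespace arm_compute;
    const TensorInfo lhs(TensorShape(8U, 4U), 1, DataType::F32);
    const TensorInfo rhs(TensorShape(8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(8U, 4U), 1, DataType::U8);
    // This instantiation is exported explicitly from CpuElementwise.cpp above.
    return cpu::CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>::validate(&lhs, &rhs, &dst);
}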
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
index 7fd14dba7d..04ab7bf8f5 100644
--- a/src/cpu/operators/CpuElementwiseUnary.cpp
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
@@ -47,7 +48,7 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src
void CpuElementwiseUnary::run(ITensorPack &tensors)
{
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors)
ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
index 5e8e98d047..1e51bfaa1c 100644
--- a/src/cpu/operators/CpuElementwiseUnary.h
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -56,4 +57,4 @@ public:
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
index 3d8f62fe07..1890d0b916 100644
--- a/src/cpu/operators/CpuFill.cpp
+++ b/src/cpu/operators/CpuFill.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFill.h"
-#include "src/cpu/kernels/CpuFillKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFillKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
index 41d9a9fa8a..cb83745d29 100644
--- a/src/cpu/operators/CpuFill.h
+++ b/src/cpu/operators/CpuFill.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_FILL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
index 7bab9e481c..2609d44590 100644
--- a/src/cpu/operators/CpuFlatten.cpp
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -23,16 +23,14 @@
*/
#include "src/cpu/operators/CpuFlatten.h"
-#include "src/cpu/operators/CpuReshape.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
namespace arm_compute
{
namespace cpu
{
-CpuFlatten::CpuFlatten()
- : _reshape(nullptr)
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
{
}
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
index 868add7d29..a107393b01 100644
--- a/src/cpu/operators/CpuFloor.cpp
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFloor.h"
-#include "src/cpu/kernels/CpuFloorKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index 395d8d2aa5..85a0b0311b 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -25,10 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
@@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format)
+Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ bool enable_fast_math,
+ WeightFormat weight_format)
{
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
@@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe
// Validate gemmlowp function
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
- &weights_info,
- biases,
- dst,
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info));
}
else
{
@@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected()
CpuFullyConnected::~CpuFullyConnected() = default;
-void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
// Configure gemmlowp function and output stage for asymmetric quantized types
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+ const Status status =
+ get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
GEMMInfo gemm_info;
@@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *
}
}
-void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
@@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI
configure_mm(&_flattened_src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
@@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf
configure_mm(src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void CpuFullyConnected::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- fc_info,
- weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
_needs_weights_conversion = false;
@@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
@@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Reshape weights if needed
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Reshape the weights
_transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
@@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Convert weights
_convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
- _convert_weights->configure(weights_to_use,
- &_converted_weights,
- src->tensor_shape(),
+ _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(),
fc_info.weights_trained_layout);
_converted_weights.set_are_values_constant(weights_to_use->are_values_constant());
@@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
_trans_weights_idx = AuxTensorIdx::ConvertedWeights;
}
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
@@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Retain the tensorinfo with the weights to use
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
_trans_weights = *weights_to_use;
}
// Set auxiliary memory requirements
auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
{
_aux_mem[i] = gemm_mem_req[i];
}
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
// Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
// Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time.
_aux_mem[TransposedWeights] = MemoryInfo(
offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent :
- MemoryLifetime::Prepare,
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare,
_reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
- _converted_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
- _aux_mem[TransposedWeights] = MemoryInfo(
- offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- _needs_weights_conversion ? MemoryLifetime::Prepare :
- MemoryLifetime::Persistent,
- _reshaped_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : _needs_weights_conversion ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent,
+ _reshaped_weights.total_size());
_aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
+ offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
_converted_weights.total_size());
}
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
-Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info)
+Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info)
{
GEMMInfo gemm_info;
gemm_info.set_activation_info(fc_info.activation_info);
@@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status CpuFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
if (is_fixed_format_fast_math(weights_info.weight_format()))
{
@@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &flatten_src =
+ TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped
+ ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
}
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
is_fc_after_conv = src->num_dimensions() > 1;
}
- if(!weights_reshaped)
+ if (!weights_reshaped)
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
- if(is_fc_after_conv)
+ if (is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
// Validate flatten kernel
ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
@@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
}
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format()));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info,
+ fc_info.enable_fast_math, weights_info.weight_format()));
return Status{};
}
@@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors)
CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
// Linearize src if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
_flatten->run(flatten_pack);
}
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
}
// Run matrix multiply
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
_mm_gemmlowp->run(gemm_pack);
}
@@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
void CpuFullyConnected::prepare(ITensorPack &tensors)
{
- if(!_is_prepared || _dynamic_weights)
+ if (!_is_prepared || _dynamic_weights)
{
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
++_asrt_prepare_count;
@@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape of the weights (happens only once)
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
- NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+ NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(),
+ transpose_pack);
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
}
// Convert weights if needed (happens only once)
- if(_needs_weights_conversion)
+ if (_needs_weights_conversion)
{
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
@@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Prepare GEMM prepare and release unused weights
- if(!_is_quantized_asymmetric)
+ if (!_is_quantized_asymmetric)
{
_mm_gemm->prepare(gemm_pack);
}
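
The nested conditional expressions reformatted in configure() above decide the lifetime of the transposed-weights auxiliary buffer; as an illustration only, the same choice for the pretranspose branch can be restated as an if/else chain (the function name and boolean parameters are hypothetical).

// Illustrative restatement of the TransposedWeights lifetime selection used
// when the assembly pretranspose workspace is present; it mirrors the nested
// ternary in CpuFullyConnected::configure().
#include "arm_compute/core/experimental/Types.h"

arm_compute::experimental::MemoryLifetime transposed_weights_lifetime(bool dynamic_weights,
                                                                      bool is_quantized_asymmetric,
                                                                      bool has_non_constant_biases)
{
    using arm_compute::experimental::MemoryLifetime;
    if (dynamic_weights)
    {
        return MemoryLifetime::Temporary; // recalculated on every run
    }
    if (is_quantized_asymmetric && has_non_constant_biases)
    {
        return MemoryLifetime::Persistent; // weights reused for bias-offset calculation
    }
    return MemoryLifetime::Prepare; // only needed while prepare() runs
}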
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
index 1e8c6478d0..7073fb9f7c 100644
--- a/src/cpu/operators/CpuFullyConnected.h
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+#include "src/cpu/ICpuOperator.h"
+
#include <memory>
namespace arm_compute
@@ -86,16 +86,24 @@ public:
* @param[in] fc_info (Optional) Fully connected layer additional info
* @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
*
* Similar to @ref CpuFullyConnected::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format
* weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same
@@ -103,19 +111,35 @@ public:
*
* @return a status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, WeightsInfo weights_info);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info);
//Inherited methods override
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
- void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+ void configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
enum AuxTensorIdx
{
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 34b845928d..8da166dbef 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/operators/CpuGemm.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
}
} // namespace
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void CpuGemm::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = beta == 1 && c != nullptr;
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool is_c_bias = beta == 1 && c != nullptr;
+ bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
// Check if we need to reshape the matrix B only on the first run
_is_prepared = false;
@@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_run_alpha_scale = alpha != 1.f;
_run_bias_addition = is_c_bias;
_run_addition = beta != 0 && beta != 1 && c != nullptr;
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+ _run_activation =
+ gemm_info.activation_info().enabled() &&
+ (!run_optimised ||
+ (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
- if(run_optimised)
+ if (run_optimised)
{
const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
@@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
// Scale product by alpha
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
_alpha_scale_func = std::make_unique<cpu::CpuActivation>();
- _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ _alpha_scale_func->configure(
+ d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
}
}
else
@@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
// Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
+ if (_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
_mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
@@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
// Configure interleave kernel
_interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
_interleave_kernel->configure(a, &_tmp_a);
- _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[InterleavedLHS] =
+ MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
// Configure transpose kernel
_transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
_transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+ _aux_mem[TransposedRHS] =
+ MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
// Configure matrix multiplication kernel
_mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
}
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
_add_bias = std::make_unique<cpu::CpuAdd>();
_add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+ _aux_mem[TempResult] =
+ MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
}
}
// Configure matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
_ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
_ma_kernel->configure(c, d, beta);
}
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func = std::make_unique<cpu::CpuActivation>();
_activation_func->configure(d, nullptr, gemm_info.activation_info());
}
}
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CpuGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
const bool is_c_bias = beta == 1 && c != nullptr;
@@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_fixed_format_fast_math(gemm_info.weight_format()))
+ if (is_fixed_format_fast_math(gemm_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
const int block_by = arm_compute::block_by(gemm_info.weight_format());
// test if im2col has changed the dimensions that are needed for padding
- if(a->dimension(0) != b->dimension(1) && block_by > 1)
+ if (a->dimension(0) != b->dimension(1) && block_by > 1)
{
// have to verify bias
const size_t dim0_sz = a->dimension(0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz % block_by) != 0,
+ ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
// a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
// b->dimension(1) = kernel_area * input_channel
// a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+ "The product AB is defined only if A number of columns and B number of rows are related");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ a->dimension(0) != b->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
+ if (a->data_type() != DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
}
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+ "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+ "The C matrix must have the same number of columns as the matrix B");
}
- if(d->total_size() != 0)
+ if (d->total_size() != 0)
{
// For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if (gemm_info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if (gemm_info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
@@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
}
// Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
-
- if(!run_optimised)
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(),
+ "CpuGemm cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0,
+ "CpuGemm cannot reinterpret the output tensor as 3D");
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(
+ m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
@@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
TensorInfo tmp_b_info{};
TensorInfo tmp_output_info = *d->clone();
- if(run_interleave_transpose)
+ if (run_interleave_transpose)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(
+ *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
+ *b, mult_transpose1xW_width)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
}
// Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+ auto_init_if_empty(tmp_output_info,
+ matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
- if(is_c_bias)
+ if (is_c_bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
}
}
// Validate matrix addition kernel
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
}
@@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors)
auto c = tensors.get_const_tensor(ACL_SRC_2);
auto d = tensors.get_tensor(ACL_DST);
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
// Pass c to asm dispatch only if it's the bias tensor
ITensorPack asm_pack = tensors;
asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr);
_asm_glue->run(asm_pack);
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_alpha_scale_func->run(pack);
}
}
@@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors)
CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
- ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
- if(!_run_vector_matrix_multiplication)
+ ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
+ if (!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
- NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
+ ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
+ NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
+ interleave_pack);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
// Use reshaped matrices
@@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors)
mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
}
- NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
+ NEScheduler::get().schedule_op(_mm_kernel.get(),
+ _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
+ _mm_kernel->window(), mm_pack);
// Run bias addition kernel
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
- ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}};
_add_bias->run(pack);
}
}
// Run matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
- ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
+ ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}};
NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
}
// Run activation function
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_activation_func->run(pack);
}
}
void CpuGemm::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
{
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *b_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
_is_prepared = true;
}
@@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const
return _aux_mem;
}
-Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info)
+Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info)
{
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
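
The run_optimised decision reformatted above combines three constraints; restated as a standalone predicate for readability (illustrative only, parameter names are hypothetical):

// Illustrative restatement of the "use the optimised assembly GEMM" decision:
// the assembly path supports only beta in {0, 1} when C is present, and is
// skipped for non-constant batched B, which it would batch differently.
bool choose_optimised_gemm(bool asm_validate_ok, bool has_c, float beta, bool b_values_constant, bool b_is_batched)
{
    return asm_validate_ok && (!has_c || beta == 0.f || beta == 1.f) && !(!b_values_constant && b_is_batched);
}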
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
index 9b08e5d0f6..6b30d134fa 100644
--- a/src/cpu/operators/CpuGemm.h
+++ b/src/cpu/operators/CpuGemm.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CPU_GEMM_H
#define ARM_COMPUTE_CPU_GEMM_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
@@ -93,16 +93,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
*
* Similar to @ref CpuGemm::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -111,12 +121,16 @@ public:
* the value of arm_compute::WeightFormat need to be passed via the
* parameter gemm_info.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info = GEMMInfo());
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
/** Indicates if the convolution executes in variable weights mode.
@@ -138,28 +152,28 @@ private:
Count
};
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
- std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr };
- std::unique_ptr<CpuAdd> _add_bias{ nullptr };
- std::unique_ptr<CpuActivation> _activation_func{ nullptr };
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
+ std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr};
+ std::unique_ptr<CpuAdd> _add_bias{nullptr};
+ std::unique_ptr<CpuActivation> _activation_func{nullptr};
TensorInfo _tmp_a{};
TensorInfo _tmp_b{};
TensorInfo _tmp_d{};
- bool _run_vector_matrix_multiplication{ false };
- bool _run_alpha_scale{ false };
- bool _run_addition{ false };
- bool _run_bias_addition{ false };
- bool _run_activation{ false };
- bool _reshape_b_only_on_first_run{ false };
- bool _is_prepared{ false };
+ bool _run_vector_matrix_multiplication{false};
+ bool _run_alpha_scale{false};
+ bool _run_addition{false};
+ bool _run_bias_addition{false};
+ bool _run_activation{false};
+ bool _reshape_b_only_on_first_run{false};
+ bool _is_prepared{false};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
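
A minimal usage sketch of the interface declared above (illustrative only; shapes, alpha/beta and the default GEMMInfo are arbitrary placeholders, and example_cpu_gemm_setup is a hypothetical function):

// Illustrative sketch: probe support, then configure a plain F32 GEMM
// D = alpha * A * B via the reformatted signatures.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/GEMMInfo.h"
#include "src/cpu/operators/CpuGemm.h"

void example_cpu_gemm_setup()
{
    using namespace arm_compute;
    // TensorShape is (columns, rows): A is 32x64, B is 64x16, D is 32x16.
    TensorInfo a(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo b(TensorShape(16U, 64U), 1, DataType::F32);
    TensorInfo d(TensorShape(16U, 32U), 1, DataType::F32);

    const GEMMInfo info{};
    if (bool(cpu::CpuGemm::validate(&a, &b, nullptr, &d, 1.f, 0.f, info)))
    {
        cpu::CpuGemm gemm;
        gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f, info);
        // run() and prepare() then take an ITensorPack holding the backing tensors.
    }
}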
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 39b410d609..7c59d88c61 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
@@ -52,8 +52,11 @@ namespace arm_compute
{
namespace cpu
{
-CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- if(skip_im2col)
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ if (skip_im2col)
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
+ if (skip_col2im)
{
- return { true, true };
+ return {true, true};
}
}
else
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
+ if (skip_col2im)
{
- return { false, true };
+ return {false, true};
}
}
// Default case when we cannot reinterpret the input and output as 3D.
- return { false, false };
+ return {false, false};
}
CpuGemmConv2d::CpuGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+ : _weights_reshape_kernel(nullptr),
+ _im2col_kernel(),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _col2im_kernel(),
+ _reshape(),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
{
}
CpuGemmConv2d::~CpuGemmConv2d() = default;
-void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
- bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format)
+void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
+ _skip_im2col, fixed_format, weight_format));
// Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo &gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
// Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
- if(_is_quantized)
+ if (_is_quantized)
{
- TensorInfo tmp_src{ *src };
- TensorInfo tmp_weights{ *weights };
+ TensorInfo tmp_src{*src};
+ TensorInfo tmp_weights{*weights};
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo iqinfo = src->quantization_info();
@@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
const DataType data_type = src->data_type();
tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+ if (!is_data_type_quantized_per_channel(tmp_weights.data_type()))
{
const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
@@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act_info.activation()) != 0)
+ if (supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
@@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
- weight_format));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
+ enable_fast_math, false, act_info, fixed_format, weight_format));
auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
@@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
_mm_gemm = std::make_unique<CpuGemm>();
_mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
}
}
-Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format)
+Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
const DataType data_type = src->data_type();
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool is_activation_enabled = act_info.enabled();
// Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
- if(is_quantized)
+ if (is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
@@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math,
- false, act_info));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
+ output_info, false, enable_fast_math, false, act_info));
}
else
{
@@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
}
}
-Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col)
{
const DataType data_type = input_info->data_type();
const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
// Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type,
+ input_info->quantization_info());
const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type,
+ input_info->quantization_info());
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false,
+ gemm_3d_depth, skip_im2col);
}
-void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuGemmConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
- weights,
- biases,
- dst,
- conv_info,
- weights_info,
- dilation,
- act_info,
- enable_fast_math,
- num_groups));
- ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math,
+ num_groups);
const DataType data_type = src->data_type();
const DataLayout data_layout = src->data_layout();
@@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_is_prepared = weights_info.retain_internal_weights();
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
const ITensorInfo *gemm_input_to_use = src;
ITensorInfo *gemm_output_to_use = dst;
@@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
- _skip_im2col = skip_info.skip_im2col;
- _skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ _skip_im2col = skip_info.skip_im2col;
+ _skip_col2im = skip_info.skip_col2im;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_weights_reshaped.set_quantization_info(weights->quantization_info());
// Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
unsigned int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
}
// Configure
_im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
- _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right);
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation,
+ num_groups, input_pad_right);
// Update GEMM input
gemm_input_to_use = &_im2col_output;
@@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
TensorShape shape_gemm;
@@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format());
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
+ gemm_3d_depth, fixed_format, weights_info.weight_format());
- if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+ if (!_skip_col2im && _data_layout == DataLayout::NCHW)
{
// Configure col2im
_col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
@@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
// Check lifetime
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+ gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+ _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
}
-Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
const bool skip_im2col = skip_info.skip_im2col;
const bool skip_col2im = skip_info.skip_col2im;
const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
- if(!is_fixed_format(weights_info.weight_format()))
+ if (!is_fixed_format(weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
- const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(is_bf16)
+ else if (is_bf16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
}
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+ unsigned int mat_weights_rows =
+ weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type());
weights_reshaped_info.set_quantization_info(weights->quantization_info());
weights_to_use = &weights_reshaped_info;
- if(!skip_im2col)
+ if (!skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
- mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) *
+ (weights->dimension(idx_channel) + input_pad_right);
}
// Create tensor info for im2col reshaped inputs
@@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
im2col_reshaped_info.set_quantization_info(src->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height),
+ conv_info, append_bias, dilation, num_groups, input_pad_right));
gemm_input_to_use = &im2col_reshaped_info;
}
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
+ if (!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
@@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
gemm_output_to_use = &info_gemm;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
+ enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
weights_info.weight_format()));
// Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
+ if (!skip_col2im && (data_layout == DataLayout::NCHW))
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
}
return Status{};
@@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
// Run input reshaping
unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
gemm_input_to_use = im2col_output.get();
}
@@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
gemm3d.allocator()->import_memory(out_to_use->buffer());
auto gemm_output_to_use = gemm_output.get();
- if(_skip_im2col)
+ if (_skip_im2col)
{
gemm_output_to_use = &gemm3d;
}
- if(_skip_col2im && !out_has_padding)
+ if (_skip_col2im && !out_has_padding)
{
gemm_output_to_use = dst;
}
@@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
// Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- if(!this->isVarWeightsKernel())
+ if (!this->isVarWeightsKernel())
{
pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
}
pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- if(_is_quantized)
+ if (_is_quantized)
{
// Run gemmlowp
_mm_gemmlowp->run(pack_mm);
@@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
}
// Reshape output matrix
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output.get() },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}};
NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
}
else
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
- else if(out_has_padding)
+ else if (out_has_padding)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
void CpuGemmConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Variable weights executions that use fixed-format kernels
// need no reshaping of the weights.
- if(this->isVarWeightsKernel())
+ if (this->isVarWeightsKernel())
{
_is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
_is_prepared = true;
@@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors)
// Run weights reshaping and mark original weights tensor as unused
CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
weights->mark_as_unused();
ITensorPack gemm_pack = tensors;
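
Aside from the reflowed formatting, the hunks above keep the heuristic that decides when im2col can be skipped: a 1x1 kernel with unit stride on NHWC data already presents the input as the GEMM left-hand side, so no im2col scratch buffer is needed (and col2im additionally depends on whether the GEMM output can be reinterpreted as 3D, see validate_gemm3d above). The following is an illustrative, standalone sketch of that rule only, using hypothetical plain types instead of ITensorInfo/PadStrideInfo; it is not part of the patch.

// Sketch of the im2col-skip rule from CpuGemmConv2d::skip_im_col_info.
#include <cstdio>

enum class Layout { NCHW, NHWC };

static bool can_skip_im2col(Layout layout, unsigned kernel_w, unsigned kernel_h,
                            unsigned stride_x, unsigned stride_y)
{
    // NHWC + 1x1 kernel + unit stride: every output pixel reads exactly one
    // input pixel, so the input tensor can feed the GEMM directly.
    return layout == Layout::NHWC && kernel_w == 1 && kernel_h == 1 &&
           stride_x == 1 && stride_y == 1;
}

int main()
{
    std::printf("%d\n", can_skip_im2col(Layout::NHWC, 1, 1, 1, 1)); // prints 1
    std::printf("%d\n", can_skip_im2col(Layout::NCHW, 3, 3, 1, 1)); // prints 0
    return 0;
}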
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 61fe63a79f..118d366517 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
#include <memory>
@@ -106,17 +107,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -124,10 +140,16 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- const bool enable_fast_math = false);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const bool enable_fast_math = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -150,8 +172,15 @@ private:
* @param[in] fixed_format (Optional) Select GEMM execution with variable weights.
* @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
*/
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -170,8 +199,16 @@ private:
*
* @return a status
*/
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool skip_im2col = false,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -182,7 +219,11 @@ private:
*
* @return a status
*/
- static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+ static Status validate_gemm3d(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col);
struct SkipInfo
{
@@ -200,8 +241,11 @@ private:
*
* @return a SkipInfo instance.
*/
- static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info);
+ static SkipInfo skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info);
/** Indicates if the convolution executes in variable weights mode.
*
@@ -236,7 +280,7 @@ private:
bool _is_quantized;
bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
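
One more note on the CpuGemmConv2d hunks before moving on to CpuGemmDirectConv2d: when a fixed-format weight layout groups input channels in blocks of block_by, the reflowed configure()/validate() code pads the channel dimension on the right up to the next multiple of block_by. A hedged standalone sketch of that computation follows; the function name is hypothetical, while channels and block_by mirror src->dimension(idx_channel) and arm_compute::block_by() in the hunks above.

// Sketch of the input_pad_right computation used when block_by > 1.
#include <cstdio>

static unsigned int channel_pad_right(unsigned int channels, unsigned int block_by)
{
    if (block_by <= 1)
    {
        return 0;
    }
    const unsigned int rem = channels % block_by;
    return rem == 0 ? 0 : block_by - rem; // pad up to the next multiple of block_by
}

int main()
{
    std::printf("%u\n", channel_pad_right(13U, 4U)); // prints 3
    return 0;
}

With, say, 13 channels and block_by = 4 this yields 3 elements of padding, which is what feeds the padded row count mat_weights_rows = kernel_w * kernel_h * (channels + pad) in validate().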
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index 5ce285cb6f..8fa81b1907 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
-
#include "support/Cast.h"
#include <set>
@@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast;
namespace
{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
const DataType data_type = src->data_type();
// Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ PixelValue type_min{};
+ PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(data_type);
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
+ if (supported_acts.count(act.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
}
@@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d()
CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
_run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
_is_prepared = false;
- _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
+ _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
// Configure assembly dispatch
cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
}
_gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, info.act_info);
}
@@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
_aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
}
else
{
// We must permute weights if they are WeightFormat::UNSPECIFIED
- if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+ if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
}
}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(!is_fixed_format(info.weights_info.weight_format()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ if (!is_fixed_format(info.weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(data_type == DataType::BFLOAT16)
+ else if (data_type == DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors)
prepare(tensors);
_gemm_asm_func->run(tensors);
- if(_run_activation)
+ if (_run_activation)
{
ITensor *io = tensors.get_tensor(ACL_DST);
- ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } };
+ ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}};
_activation_func->run(pack);
}
}
void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// If we are using fixed-format kernel the weights are already reshaped
- if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
+ if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
{
_gemm_asm_func->prepare(tensors);
_is_prepared = true;
return;
}
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_weights_permute_func->run(permute_tensors);
tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
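
The calculate_output_stage_metadata() hunk above merges a supported activation (RELU, BOUNDED_RELU, LU_BOUNDED_RELU) into the quantized output stage by narrowing the clamp range from the type limits to the quantized activation bounds. The sketch below illustrates roughly what get_quantized_activation_min_max computes for a BOUNDED_RELU on QASYMM8 data; the scale, offset and bound values are made up for the example and this is not part of the patch.

// Sketch: fold a bounded activation into a quantized clamp range.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static int32_t quantize(float x, float scale, int32_t offset)
{
    return static_cast<int32_t>(std::lround(x / scale)) + offset;
}

int main()
{
    const float   scale  = 0.05f;
    const int32_t offset = 10;
    int32_t min_activation = 0;   // QASYMM8 lower type bound
    int32_t max_activation = 255; // QASYMM8 upper type bound
    // BOUNDED_RELU with an upper bound of 6.0f clamps to [q(0), q(6)].
    min_activation = std::max(min_activation, quantize(0.0f, scale, offset)); // 10
    max_activation = std::min(max_activation, quantize(6.0f, scale, offset)); // 130
    std::printf("[%d, %d]\n", min_activation, max_activation);
    return 0;
}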
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index e55a461f36..1cc3caadae 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuActivation.h"
@@ -69,18 +70,26 @@ public:
* Data types supported: Same as @p input.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
*
* Similar to CpuGemmDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8ca128fb07..2ee879b67b 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -28,14 +28,14 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
@@ -59,12 +59,12 @@ namespace
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
- asm_info.fast_mode = info.fast_math();
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
@@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+void CpuGemmLowpMatrixMultiplyCore::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
@@ -122,28 +123,31 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_reshape_b_only_on_first_run = b->are_values_constant();
_is_prepared = false;
_fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _gemm_info = gemm_info;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+ _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
const ITensorInfo *a_to_use = a;
// Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
+ if (_flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
_convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_to_signed_asymm->configure(a_to_use, &_signed_a);
a_to_use = &_signed_a;
_a_offset = _signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
- _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
@@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
// Initialize assembly kernel meta-data
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
#ifdef __aarch64__
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
auto c_info_to_use = c == nullptr ? nullptr : c;
_asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
@@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
}
#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
{
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ _tmp_a =
+ TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
_tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
@@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_b_reshape_kernel->configure(b, &_tmp_b);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Build reduction info
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if (_a_offset != 0)
{
_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
@@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
_vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
@@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
}
- _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : dst,
- a->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel =
+ std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(
+ &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
+ a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
- if(_flip_signedness)
+ if (_flip_signedness)
{
_convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_from_signed_asymm->configure(&_signed_output, dst);
@@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
else
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, dst);
}
// Configure offset contribution kernel
_offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+ _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
_a_offset, _b_offset);
}
}
// Configure activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
+ _run_activation =
+ activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if (_run_activation)
{
_activation_func = std::make_unique<CpuActivation>();
_activation_func->configure(dst, nullptr, activation);
}
- if(_assembly_path)
+ if (_assembly_path)
{
auto asm_mem_req = _asm_glue->workspace();
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
@@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Request memory for LHS and RHS reshape matrix
- _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
- && _reshape_b_only_on_first_run ?
- MemoryLifetime::Persistent :
- MemoryLifetime::Temporary,
- _vector_sum_col.total_size());
- _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
- _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+ _aux_mem[VectorSumCol] =
+ MemoryInfo(offset_int_vec(VectorSumCol),
+ !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
+ : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] =
+ MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _tmp_b.total_size());
+ _aux_mem[MMResultS32] =
+ MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] =
+ MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}
-Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
int32_t b_offset = b->quantization_info().uniform().offset;
bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ auto_init_if_empty(mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
// Convert QASYMM8->QASYMM8_SIGNED
TensorInfo signed_a{};
TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+ (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if (flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
a_to_use = &signed_a;
a_offset = signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+ a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
}
}
- if(run_optimised)
+ if (run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
+ if (info.depth_output_gemm3d() != 0)
{
- if(info.reinterpret_input_as_3d())
+ if (info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
+ if (!run_vector_matrix_multiplication)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
@@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
}
- if(!run_optimised_requantized)
+ if (!run_optimised_requantized)
{
TensorInfo info_vector_sum_col{};
TensorInfo info_vector_sum_row{};
@@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
+ if (a_offset != 0)
{
info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
// Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
+ if (b_offset != 0)
{
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
}
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
+ b_offset, info.gemmlowp_output_stage()));
}
else
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
}
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
}
@@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
// Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
+ if (_flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a },
- { TensorType::ACL_DST, signed_a.get() }
- };
- NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
+ pack);
a_to_use = signed_a.get();
matrix_a = signed_a.get();
}
// Run GEMM
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
ITensorPack asm_glue_tensors = tensors;
auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
+ _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
@@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
}
else
{
- if(!_run_vector_matrix_multiplication)
+ if (!_run_vector_matrix_multiplication)
{
matrix_a = tmp_a.get();
matrix_b = tmp_b.get();
// Run interleave kernel
- ITensorPack pack_a =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, tmp_a.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+ ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
+ pack_a);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
- ITensorPack pack_b =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
+ ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
// Run transpose kernel
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
+ _mtx_b_reshape_kernel->window(), pack_b);
}
}
- ITensorPack pack_mm =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b }
- };
- if(_fuse_output_stage)
+ ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
+ if (_fuse_output_stage)
{
pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
}
@@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, vector_sum_row.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
+ _mtx_a_reduction_kernel->window(), pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
@@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
+ _offset_contribution_output_stage_kernel->window(), pack);
}
else
{
@@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
+ _offset_contribution_kernel->window(), pack);
}
}
// Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, signed_output.get() },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
+ _convert_from_signed_asymm->window(), pack);
}
// Run fused activation unless already run in the fused assembly
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
_activation_func->run(pack);
}
}
void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
// Run assembly reshape
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
// Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
{
// Run reshape kernel and mark original weights tensor as unused
- ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
+ pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
{
- ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ ITensor *vector_sum_col_p =
+ utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
_is_prepared = true;
}
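Note: the configure/run/prepare flow reformatted above is what sits behind the public NEON function named in the validation message (NEGEMMLowpMatrixMultiplyCore). A minimal usage sketch follows; the shapes and quantization parameters are illustrative assumptions, and the S32 output with a null bias corresponds to the output-stage-NONE path checked by validate().

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // A: M=32 x K=64 (QASYMM8), B: K=64 x N=16 (QASYMM8), dst: M=32 x N=16 (S32).
    // Shapes and quantization parameters are made up for the example.
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemm;
    gemm.configure(&a, &b, nullptr, &dst); // bias must stay nullptr while the output remains S32

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    gemm.run(); // drives the operator-level run()/prepare() shown above
    return 0;
}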
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
index a1b34291d0..a7798938e7 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -108,18 +109,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
index 58f98acff0..4215eed199 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
@@ -36,36 +37,42 @@ namespace arm_compute
{
namespace cpu
{
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void CpuGemmLowpOutputStage::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QASYMM8_SIGNED:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QSYMM16:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound,
+ info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
@@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
}
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN,
+ "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) &&
+ (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
- return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QASYMM8_SIGNED:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QSYMM16:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
default:
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
}
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
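The switch above dispatches purely on the GEMMLowpOutputStageInfo contents, so a caller only has to fill that struct. A hedged sketch of the QUANTIZE_DOWN_FIXEDPOINT / QASYMM8 case follows; the multiplier, shift, offset and bounds are illustrative assumptions and would normally come from quantization::calculate_quantized_multiplier and the destination quantization info.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuGemmLowpOutputStage.h"

using namespace arm_compute;

// Requantise an S32 accumulator tensor down to QASYMM8 with the fixed-point output stage.
Status configure_requant_stage(cpu::CpuGemmLowpOutputStage &stage,
                               ITensorInfo *acc_s32, ITensorInfo *bias, ITensorInfo *dst_q8)
{
    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.output_data_type    = DataType::QASYMM8;
    info.gemmlowp_multiplier = 1073741824; // fixed-point multiplier (assumed value)
    info.gemmlowp_shift      = 3;          // result shift (assumed value)
    info.gemmlowp_offset     = 10;         // destination zero point (assumed value)
    info.gemmlowp_min_bound  = 0;          // clamp to the QASYMM8 range
    info.gemmlowp_max_bound  = 255;

    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmLowpOutputStage::validate(acc_s32, bias, dst_q8, info));
    stage.configure(acc_s32, bias, dst_q8, info);
    return Status{};
}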
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
index 39394f6b5f..e5e2f41fa9 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.h
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
/** This file contains all available output stages for GEMMLowp.
@@ -76,7 +77,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 8811a7ea6b..89087129c3 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -23,14 +23,16 @@
*/
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/MatMulInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +48,11 @@ namespace cpu
{
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
} // namespace
CpuMatMul::CpuMatMul()
- : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
+ : _transpose_kernel_lhs(),
+ _transpose_kernel_rhs(),
+ _asm_glue(),
+ _lhs_transposed(),
+ _rhs_transposed(),
+ _original_lhs_shape(),
+ _original_rhs_shape(),
+ _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status CpuMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
@@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
- if(adj_lhs)
+ if (adj_lhs)
{
- auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+ auto_init_if_empty(lhs_transposed,
+ lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
// Assign lhs_to_use pointer to use transposed TensorInfo
lhs_to_use = &lhs_transposed;
}
- if(adj_rhs)
+ if (adj_rhs)
{
- auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+ auto_init_if_empty(rhs_transposed,
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
// Assign rhs_to_use pointer to use transposed TensorInfo
rhs_to_use = &rhs_transposed;
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)");
+ "The product AB is defined only if the number of columns in A is equal to the "
+ "number of rows in B (after transpose)");
// Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
- for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+ for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+ "Broadcasting in Batch dimension is unsupported by this operator.");
}
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+ gemm_info.activation_info, gemm_info.output_stage));
}
cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
@@ -138,7 +162,12 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CpuMatMul::configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
_original_rhs_shape = rhs_to_use.tensor_shape();
// Reshape lhs for use with assembly kernels.
- lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
- dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
+ lhs_to_use.set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
+ dst_to_use.set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
// 2. Configuration for transpose of lhs/rhs
// ------------------------------------------------------
// Initialise transposed TensorInfo class for aux tensors (intermediary tensors)
- if(_adj_lhs)
+ if (_adj_lhs)
{
// Setup transpose LHS
_transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
_transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
}
- if(_adj_rhs)
+ if (_adj_rhs)
{
// Setup transpose RHS
_transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
@@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage);
+ get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+ _gemm_info.output_stage);
}
// Configure Asm Kernel
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
- _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul
+ _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+ _gemm_info); // c is nullptr as bias not supported in MatMul
// Specify memory requirements for intermediate tensors
auto asm_mem_req = _asm_glue->workspace();
// Specify memory required by gemm kernel
int idx = 0;
- for(const auto &aux : asm_mem_req)
+ for (const auto &aux : asm_mem_req)
{
_aux_mem[idx] = aux;
idx++;
@@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors)
// Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm)
// Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
- lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
- dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ lhs->info()->set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+ _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ dst->info()->set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+ _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
// Initialise object to handle stored transposed tensors in auxillary memory
@@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors)
ITensorPack asm_tensors(tensors);
// Run transpose lhs if necessary
- if(_adj_lhs)
+ if (_adj_lhs)
{
- ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack);
+ ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+ lhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
}
// Run transpose rhs if necessary
- if(_adj_rhs)
+ if (_adj_rhs)
{
- ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack);
+ ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+ rhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
}
// Run asm kernel
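get_gemmlowp_output_stage_info() above essentially folds the lhs/rhs/dst uniform quantization scales into one float rescale factor and decomposes it with quantization::calculate_quantized_multiplier into the integer multiplier and shift the fixed-point output stage consumes. A hedged, self-contained restatement of that core step (error handling and the activation-dependent clamping from get_quantized_asymmetric_output_min_max are omitted):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

using namespace arm_compute;

Status derive_requant_params(const ITensorInfo &lhs, const ITensorInfo &rhs, const ITensorInfo &dst,
                             int32_t &output_multiplier, int32_t &output_shift)
{
    const UniformQuantizationInfo iq0 = lhs.quantization_info().uniform();
    const UniformQuantizationInfo iq1 = rhs.quantization_info().uniform();
    const UniformQuantizationInfo oq  = dst.quantization_info().uniform();

    const float multiplier = (iq0.scale * iq1.scale) / oq.scale; // combined rescale factor

    // Splits 'multiplier' into a fixed-point multiplier plus shift for the output-stage kernels.
    return quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
}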
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 475c019fd0..24db3da346 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_OPERATORS_CPUMATMUL
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuTransposeKernel.h"
@@ -66,18 +67,27 @@ public:
* @param[in] settings The settings for matmul operation (i.e fast math)
* @param[in] act_info Class containing information about fused activation function.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -91,9 +101,9 @@ private:
};
// Define unique pointers to kernels/operators used by matmul
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr };
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
// TensorInfo for tensors stored in auxillary memory
TensorInfo _lhs_transposed{};
@@ -105,13 +115,13 @@ private:
TensorShape _original_dst_shape{};
// Note : adj_lhs means the same as transposing lhs
- bool _adj_lhs{ false };
- bool _adj_rhs{ false };
- bool _fast_math{ false };
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
+ bool _fast_math{false};
AsmGemmInfo _gemm_info{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
-}
-}
+} // namespace cpu
+} // namespace arm_compute
#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
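At the function level this operator sits behind NEMatMul, whose header the include reorder above touches. A minimal F32 usage sketch under assumed shapes; note both operands are marked non-constant, matching the "must be dynamic" checks in validate().

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // lhs: [K=8, M=4, batch=2], rhs: [N=6, K=8, batch=2], dst: [N=6, M=4, batch=2]. Shapes are assumptions.
    TensorInfo lhs_info(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    TensorInfo rhs_info(TensorShape(6U, 8U, 2U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(6U, 4U, 2U), 1, DataType::F32);
    lhs_info.set_are_values_constant(false); // MatMul requires dynamic LHS/RHS
    rhs_info.set_are_values_constant(false);

    Tensor lhs, rhs, dst;
    lhs.allocator()->init(lhs_info);
    rhs.allocator()->init(rhs_info);
    dst.allocator()->init(dst_info);

    NEMatMul matmul;
    // MatMulInfo carries the adj_lhs/adj_rhs transpose flags; CpuMatMulSettings exposes fast math.
    matmul.configure(&lhs, &rhs, &dst, MatMulInfo(), CpuMatMulSettings());

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();
    matmul.run();
    return 0;
}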
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
index 24e9fd6d46..697fc40ab3 100644
--- a/src/cpu/operators/CpuMaxUnpooling.cpp
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuMaxUnpooling.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
@@ -29,7 +30,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpooling::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
@@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic
_kernel = std::move(k);
}
-Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpooling::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
index aa1f1072a5..5dc00bce9e 100644
--- a/src/cpu/operators/CpuMaxUnpooling.h
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -44,14 +44,18 @@ public:
* @param[out] dst Destination tensor. Data types supported: Same as @p src
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuMaxUnpooling::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
index 4c15015206..ac9847111d 100644
--- a/src/cpu/operators/CpuMul.cpp
+++ b/src/cpu/operators/CpuMul.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMulKernel.h"
@@ -33,14 +34,24 @@ namespace arm_compute
{
namespace cpu
{
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status CpuMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
}
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void CpuMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
@@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status CpuComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
}
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void CpuComplexMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
@@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
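CpuMul is a thin wrapper over CpuMulKernel, so driving it only needs the configure() signature reflowed above plus an ITensorPack. A hedged F32 sketch; the shape is an assumption and the ACL_SRC_0/ACL_SRC_1/ACL_DST slot names follow the binary-operator pattern used elsewhere in this file.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuMul.h"

using namespace arm_compute;

int main()
{
    // Element-wise F32 multiplication: scale 1, saturating conversion, round-to-zero.
    TensorInfo src_info(TensorShape(16U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 8U), 1, DataType::F32);

    cpu::CpuMul mul;
    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::CpuMul::validate(&src_info, &src_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    mul.configure(&src_info, &src_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    Tensor src1, src2, dst;
    src1.allocator()->init(src_info);
    src2.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src1.allocator()->allocate();
    src2.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack = {{TensorType::ACL_SRC_0, &src1}, {TensorType::ACL_SRC_1, &src2}, {TensorType::ACL_DST, &dst}};
    mul.run(pack);
    return 0;
}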
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
index 3e0edbf050..82b309830b 100644
--- a/src/cpu/operators/CpuMul.h
+++ b/src/cpu/operators/CpuMul.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -61,7 +62,12 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -69,7 +75,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -89,14 +100,20 @@ public:
* @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuComplexMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
index babaf21b6f..25acc92d00 100644
--- a/src/cpu/operators/CpuPermute.cpp
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuPermute.h"
-#include "src/cpu/kernels/CpuPermuteKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPermuteKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons
{
return kernels::CpuPermuteKernel::validate(src, dst, perm);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee5..b72bde6978 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool2dKernel.h"
#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
@@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
// Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
// Get data layout
_data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Check if we have Global Pooling Layer
const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
- _use_kernel_indices = pool_info.use_kernel_indices;
+ _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) &&
+ (src->dimension(idx_height) == pool_info.pool_size.height);
+ _use_kernel_indices = pool_info.use_kernel_indices;
- if(run_optimised)
+ if (run_optimised)
{
const CPUInfo &ci = NEScheduler::get().cpu_info();
const unsigned int num_threads = NEScheduler::get().num_threads();
@@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Get kernel's memory requirements
constexpr size_t alignment = 4096;
const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
- _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
_asm_glue = std::move(pooling_wrapper);
}
@@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
}
}
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
- if(run_optimised)
+ if (run_optimised)
{
return Status{};
}
@@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
- if(_asm_glue)
+ if (_asm_glue)
{
const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
}
else
{
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ _is_global_pooling_layer ? Window::DimZ : Window::DimY,
+ _pooling_layer_kernel->window(), tensors);
break;
case DataLayout::NHWC:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ (_use_kernel_indices ? Window::DimY : Window::DimX),
+ _pooling_layer_kernel->window(), tensors);
break;
default:
ARM_COMPUTE_ERROR("Data layout not supported");
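For illustration, a minimal standalone sketch of the scheduling choice made in CpuPool2d::run() above: NCHW global pooling is split over DimZ (otherwise DimY), while NHWC is split over DimY only when kernel indices are requested, otherwise DimX. Window and DataLayout below are simplified stand-ins for the library types and pick_scheduling_hint is a hypothetical helper, not library code.

enum class DataLayout
{
    NCHW,
    NHWC
};

struct Window
{
    static constexpr unsigned int DimX = 0;
    static constexpr unsigned int DimY = 1;
    static constexpr unsigned int DimZ = 2;
};

// Mirrors the switch in CpuPool2d::run(): returns the dimension used as the scheduler hint.
unsigned int pick_scheduling_hint(DataLayout layout, bool is_global_pooling, bool use_kernel_indices)
{
    if (layout == DataLayout::NCHW)
    {
        return is_global_pooling ? Window::DimZ : Window::DimY;
    }
    // NHWC
    return use_kernel_indices ? Window::DimY : Window::DimX;
}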
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
index 5c571db88a..ea73e3f335 100644
--- a/src/cpu/operators/CpuPool2d.h
+++ b/src/cpu/operators/CpuPool2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL2D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -58,17 +59,21 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuPool2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
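CpuPool2d::workspace() (declared above) reports auxiliary-memory needs through the _aux_mem vector filled in configure(). The sketch below only mirrors the shape of the library's MemoryInfo/MemoryRequirements types for illustration; the field names and the example helper are assumptions, but the Temporary lifetime and 4096-byte alignment come from the assembly pooling path shown earlier in this diff.

#include <cstddef>
#include <vector>

enum class MemoryLifetime
{
    Temporary,
    Prepare,
    Persistent
};

// Simplified stand-in: one entry per internal tensor an operator needs at run time.
struct MemoryInfo
{
    int            slot;
    MemoryLifetime lifetime;
    std::size_t    size;
    std::size_t    alignment;
};

using MemoryRequirements = std::vector<MemoryInfo>;

// The assembly pooling path above registers one Temporary, 4096-byte-aligned workspace entry;
// the runtime reads this vector back via workspace() and binds real buffers before run().
MemoryRequirements example_pooling_workspace(std::size_t workspace_size)
{
    return {MemoryInfo{0, MemoryLifetime::Temporary, workspace_size, 4096}};
}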
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
index 14e4ac6c97..7fa78c1f80 100644
--- a/src/cpu/operators/CpuPool3d.cpp
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Scheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool3dKernel.h"
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuPool3d::CpuPool3d()
- : _aux_mem(1)
+CpuPool3d::CpuPool3d() : _aux_mem(1)
{
}
@@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index 8a73f8a0af..235d798095 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL3D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -61,7 +62,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index f9e14d1f88..4315499c39 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuQuantizeKernel.h"
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index e6892a2e7e..a423abb49a 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -23,11 +23,10 @@
*/
#include "src/cpu/operators/CpuReshape.h"
-#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
namespace arm_compute
{
@@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
void CpuReshape::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- if(!_is_prepared)
+ if (!_is_prepared)
{
static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
_is_prepared = true;
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 9bc43e7db4..33da792319 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_RESHAPE_H
#define ARM_COMPUTE_CPU_RESHAPE_H
-#include "src/cpu/ICpuOperator.h"
#include "arm_compute/core/Window.h"
+#include "src/cpu/ICpuOperator.h"
+
namespace arm_compute
{
namespace cpu
@@ -53,7 +54,7 @@ public:
void run(ITensorPack &tensors) override;
private:
- bool _is_prepared{ false } ;
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
index 8a712bf088..7df9296931 100644
--- a/src/cpu/operators/CpuScale.cpp
+++ b/src/cpu/operators/CpuScale.cpp
@@ -24,8 +24,9 @@
#include "src/cpu/operators/CpuScale.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/cpu/kernels/CpuScaleKernel.h"
@@ -37,11 +38,12 @@ namespace cpu
{
namespace
{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(
+ ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
{
ARM_COMPUTE_ERROR_ON(offsets == nullptr);
float sampling_offset = 0.0f;
- if(sampling_policy == SamplingPolicy::CENTER)
+ if (sampling_policy == SamplingPolicy::CENTER)
{
sampling_offset = 0.5f;
}
@@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
- if(dx != nullptr && dy != nullptr)
+ if (dx != nullptr && dy != nullptr)
{
// Pre-compute the offset and pixel's distance for BILINEAR interpolation
Iterator offsets_it(offsets, win);
Iterator dx_it(dx, win);
Iterator dy_it(dy, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
- const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
- const int in_xi = std::floor(in_x);
- const int in_yi = std::floor(in_y);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
- *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
- },
- offsets_it, dx_it, dy_it);
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
}
else
{
// Pre-compute the offset for NEAREST interpolation
Iterator offsets_it(offsets, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float float_in_xi = (id.x() + sampling_offset) * wr;
- const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- },
- offsets_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float float_in_xi = (id.x() + sampling_offset) * wr;
+ const auto in_xi = static_cast<size_t>(
+ align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi)
+ : std::floor(float_in_xi));
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ },
+ offsets_it);
}
}
} // namespace
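A small runnable illustration of the per-pixel arithmetic inside the BILINEAR lambda above, assuming SamplingPolicy::CENTER (sampling_offset = 0.5) and a width ratio wr of 2. Only in_x, in_xi (the stored offset) and dx follow the code; the loop bounds and printing are scaffolding.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float wr              = 2.0f; // width resize ratio, e.g. 8 source pixels -> 4 destination pixels
    const float sampling_offset = 0.5f; // SamplingPolicy::CENTER
    for (int x = 0; x < 4; ++x)
    {
        const float   in_x  = (x + sampling_offset) * wr - sampling_offset; // source x coordinate
        const int32_t in_xi = static_cast<int32_t>(std::floor(in_x));       // written to the offsets tensor
        const float   dx    = in_x - in_xi;                                 // written to the dx tensor
        std::printf("dst x=%d -> offset=%d, dx=%.2f\n", x, static_cast<int>(in_xi), dx);
    }
    return 0;
}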
@@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
_is_prepared = false;
// Get data layout and width/height indices
- _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
// Get the tensor shape
TensorShape shape(dst->dimension(idx_width));
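The ratios wr/hr above come from scale_utils::calculate_resize_ratio; when both are <= 1 (up-sampling) and the policy is AREA, the code falls back to NEAREST_NEIGHBOR, as the comment states. The function below is an assumption-based restatement of the usual align-corners convention, not the library implementation.

// Assumed behaviour only: with align_corners the first and last samples of src and dst coincide,
// so the ratio is taken over (N - 1) intervals instead of N samples (dims assumed >= 2 here).
float resize_ratio(unsigned int src_dim, unsigned int dst_dim, bool align_corners)
{
    const float src = static_cast<float>(align_corners ? src_dim - 1 : src_dim);
    const float dst = static_cast<float>(align_corners ? dst_dim - 1 : dst_dim);
    return src / dst;
}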
@@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
ITensorInfo *offsets = nullptr;
ITensorInfo *dx = nullptr;
@@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape of auxiliary buffers
const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
TensorInfo tensor_info_offsets(shape, Format::S32);
TensorInfo tensor_info_dx(shape, Format::F32);
TensorInfo tensor_info_dy(shape, Format::F32);
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
offsets = &tensor_info_offsets;
@@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
break;
}
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
return Status{};
}
void CpuScale::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_is_prepared = true;
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
@@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors)
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
- bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
- if(precompute_indices_weights)
+ if (precompute_indices_weights)
{
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors)
}
else
{
- if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
{
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
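CpuScale::prepare() above precomputes dx/dy/offsets at most once, guarded by _is_prepared (CpuReshape::run() uses the same guard). A minimal sketch of that prepare-once pattern; the class and names are illustrative, not part of the library.

#include <functional>

// Heavy one-off work (e.g. precompute_dx_dy_offsets above) runs only on the first call.
class PrepareOnce
{
public:
    void prepare(const std::function<void()> &do_work)
    {
        if (!_is_prepared)
        {
            _is_prepared = true;
            do_work();
        }
    }

private:
    bool _is_prepared{false};
};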
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
index ee7c523bad..c12a8e733a 100644
--- a/src/cpu/operators/CpuScale.h
+++ b/src/cpu/operators/CpuScale.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_SCALE_H
#define ARM_COMPUTE_CPU_SCALE_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -62,9 +63,9 @@ public:
void run(ITensorPack &tensors) override;
private:
- ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- bool _is_prepared{ false };
+ ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index bf4c2fa3a2..e55d7f903e 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
@@ -63,13 +64,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
_needs_permute = actual_axis > 0;
- if(_needs_permute)
+ if (_needs_permute)
{
- _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_input.configure(src, &_input_permuted,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
// We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -79,10 +82,11 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
// Create intermediate tensors shapes
TensorShape max_sum_shape = tmp_input->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type =
+ is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
// Init intermediate tensors
_max = TensorInfo(max_info);
@@ -94,13 +98,14 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
_max_kernel = std::move(mk);
auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
- if(_needs_permute)
+ if (_needs_permute)
{
// The normalization kernel stores the result in a permuted output tensor
sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
// Re-permute the permuted output into the requested (4D) output
- _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_output.configure(&_output_permuted, dst,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
else
{
@@ -109,11 +114,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
}
_softmax_kernel = std::move(sm);
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ _aux_mem[InternalTensorIdx::MAX] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+ MemoryLifetime::Temporary, _input_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+ MemoryLifetime::Temporary, _output_permuted.total_size());
}
template <bool IS_LOG>
@@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= axis);
// Create intermediate tensor info
DataType tmp_data_type = src->data_type();
@@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
TensorShape max_sum_shape = src->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+ const TensorInfo tensor_info_max_sum(src->clone()
+ ->set_tensor_shape(max_sum_shape)
+ .set_data_type(tmp_data_type)
+ .set_quantization_info(src->quantization_info())
+ .set_is_resizable(true));
const TensorInfo dont_care;
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
const bool needs_permute = actual_axis > 0;
- if(needs_permute)
+ if (needs_permute)
{
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
- TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+ const PermutationVector permutation_vector =
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape =
+ misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+ TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
}
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+ &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
return Status{};
}
@@ -166,43 +184,38 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
- CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
+ CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
+ true);
ITensorPack max_pack;
ITensorPack softmax_pack;
- if(_needs_permute)
+ if (_needs_permute)
{
- ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
+ ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
_permute_input.run(permute_in_pack);
- max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
+ max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, input_permuted.get() },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, output_permuted.get() },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, output_permuted.get()},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
else
{
- max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, src },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, dst },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
+
+ softmax_pack = {{TensorType::ACL_SRC_0, src},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, dst},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
- if(_needs_permute)
+ if (_needs_permute)
{
ITensorPack permute_out_pack;
permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
@@ -211,7 +224,7 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
}
}
-template <bool IS_LOG>
+template <bool IS_LOG>
experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
{
return _aux_mem;
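The softmax path above normalises the requested axis with wrap_around before deciding whether to permute it to the front. A hedged sketch of that normalisation follows; the helper name is hypothetical and only intended to mirror the wrap_around usage shown above.

#include <cstdint>

// Maps a possibly negative axis (e.g. -1 for the innermost dimension) into [0, num_dimensions).
unsigned int normalise_axis(int32_t axis, int32_t num_dimensions)
{
    const int32_t wrapped = ((axis % num_dimensions) + num_dimensions) % num_dimensions;
    return static_cast<unsigned int>(wrapped);
}

// Example: normalise_axis(-1, 4) == 3, so _needs_permute above becomes true and the tensors are
// permuted so that the 1D max/softmax kernels can run along dimension 0.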
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 64df8704f9..8cab70e14f 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -24,11 +24,13 @@
#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
#define ARM_COMPUTE_CPU_SOFTMAX_H
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuPermute.h"
+
#include <memory>
namespace arm_compute
@@ -77,7 +79,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
index 91a5b6e63c..7d27efbc96 100644
--- a/src/cpu/operators/CpuSub.cpp
+++ b/src/cpu/operators/CpuSub.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuSub.h"
-#include "src/cpu/kernels/CpuSubKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuSubKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuSub::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy);
@@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuSub::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
index 88908637aa..d1782a1d3c 100644
--- a/src/cpu/operators/CpuSub.h
+++ b/src/cpu/operators/CpuSub.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_SUB_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -53,14 +54,22 @@ public:
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSub::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
index 4e7854fd6e..ea548e0511 100644
--- a/src/cpu/operators/CpuTranspose.cpp
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuTranspose.h"
-#include "src/cpu/kernels/CpuTransposeKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
return kernels::CpuTransposeKernel::validate(src, dst);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
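CpuPermute, CpuTranspose and CpuSub above are thin operators that own a single kernel and forward configure()/validate() to it. The sketch below shows that delegation pattern with illustrative types; it is not the library's ICpuOperator interface.

#include <memory>

struct ITensorInfo
{
};
struct Status
{
};

// Stand-in for a CPU kernel with the configure()/validate() pair used throughout this diff.
class ExampleKernel
{
public:
    void configure(const ITensorInfo *src, ITensorInfo *dst)
    {
        (void)src;
        (void)dst; // real kernels compute the execution window and pick an implementation here
    }
    static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
    {
        (void)src;
        (void)dst;
        return Status{};
    }
};

// The operator merely owns the kernel and forwards both calls, in the spirit of CpuSub above.
class ThinOperator
{
public:
    void configure(const ITensorInfo *src, ITensorInfo *dst)
    {
        auto k = std::make_unique<ExampleKernel>();
        k->configure(src, dst);
        _kernel = std::move(k);
    }
    static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
    {
        return ExampleKernel::validate(src, dst);
    }

private:
    std::unique_ptr<ExampleKernel> _kernel;
};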
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
index c4edd89964..9d07736c13 100644
--- a/src/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -22,23 +22,25 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuWinogradConv2d.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuPermute.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
@@ -56,21 +58,26 @@ namespace
inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
{
const DataLayout data_layout = in->data_layout();
- const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
+ const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
+ return Tensor4DShape{in_batches, in_height, in_width, in_channels};
}
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(dst, weights);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd layer only supports unit strides.");
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math,
- arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
+bool get_winograd_kernel_implementation(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ arm_conv::winograd::WinogradImpl *winograd_impl,
+ std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
{
arm_conv::winograd::WinogradConfig winograd_cfg;
arm_gemm::GemmConfig cfg;
const DataType data_type = src->data_type();
- Tensor4DShape in_shape{ internal_get_shape(src) };
- Tensor4DShape out_shape{ internal_get_shape(dst) };
- Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ Tensor4DShape in_shape{internal_get_shape(src)};
+ Tensor4DShape out_shape{internal_get_shape(dst)};
+ Tensor4DShape kernel_shape{internal_get_shape(weights)};
uint32_t nthreads = NEScheduler::get().num_threads();
// Get configuration arguments for Winograd
winograd_cfg.output_rows = 0;
winograd_cfg.output_cols = 0;
conv_args = std::make_unique<arm_conv::ConvolutionArgs>(
- in_shape.n_batches,
- arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) },
- in_shape.n_channels,
- conv_info.pad_top(),
- conv_info.pad_left(),
- arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) },
- out_shape.n_channels,
- arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) },
- assembly_utils::map_to_arm_gemm_activation(act_info));
+ in_shape.n_batches,
+ arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
+ in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+ arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+ out_shape.n_channels,
+ arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+ assembly_utils::map_to_arm_gemm_activation(act_info));
bool success = false;
- if(data_type == DataType::F32)
+ if (data_type == DataType::F32)
{
- success = arm_conv::winograd::get_implementation<float>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- else if(data_type == DataType::F16)
+ else if (data_type == DataType::F16)
{
- success = arm_conv::winograd::get_implementation<__fp16>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
else
@@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf
}
inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
} // namespace
@@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_permute_output(std::make_unique<CpuPermute>()),
_permute_weights(std::make_unique<CpuPermute>()),
_aux_mem(AuxTensorIdx::Count),
- _conv_args{ nullptr },
+ _conv_args{nullptr},
_winograd_impl{},
_data_layout(),
_winograd_transformed_input{},
@@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_weights_hwio(),
_input_nhwc(),
_output_nhwc(),
- _is_prepared{ false },
- _run_activation{ false }
+ _is_prepared{false},
+ _run_activation{false}
{
}
CpuWinogradConv2d::~CpuWinogradConv2d() = default;
-void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CpuWinogradConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
@@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
const DataType data_type = src->data_type();
uint32_t nthreads = NEScheduler::get().num_threads();
_data_layout = src->data_layout();
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
-
- bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args);
-
- ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
-
- const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
- if(has_impl)
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+
+ bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &_winograd_impl, _conv_args);
+
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+
+ const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
+ (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
+ if (has_impl)
{
// Determine how much working space is required, allocate it.
- const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
- const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t input_workspace_size =
+ _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t output_workspace_size =
+ _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
@@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
// Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
_permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
@@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
// Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
- dst->dimension(1), dst->dimension(3)),
- 1, dst->data_type());
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1,
+ dst->data_type());
_output_nhwc = info;
_permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
}
// Configure input transform kernel
- _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_input_kernel =
+ std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
// Configure GEMM function
- _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f);
+ _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
+ &_winograd_transformed_output, 1.0f, 0.f);
// Configure output transform kernel
- _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_output_kernel =
+ std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
//Configure Activation Layer
_run_activation = act_info.enabled() && !fuse_function_supported(act_info);
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, act_info);
}
@@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_aux_mem[TempResult] = asm_mem_req[TempResult];
// Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment);
- _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment);
- _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
- _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment);
- if(_data_layout == DataLayout::NCHW)
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
+ wds.input_matrix_size_bytes, storage_alignment);
+ _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
+ wds.output_matrix_size_bytes, storage_alignment);
+ _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
+ std::max(input_workspace_size, output_workspace_size));
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+ _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
+ wds.weight_matrix_size_bytes, storage_alignment);
+ if (_data_layout == DataLayout::NCHW)
{
_aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
_aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
}
}
}
-Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CpuWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
// Disable winograd for fp16 if fast math is false.
- if(!enable_fast_math)
+ if (!enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
}
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
arm_conv::winograd::WinogradImpl winograd_impl{};
std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
- const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str());
+ const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &winograd_impl, conv_args);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
return Status{};
}
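A minimal restatement of the data-type gating visible in CpuWinogradConv2d::validate() above: F32 is always eligible, while F16 additionally requires enable_fast_math (and the __fp16 path compiled under __aarch64__ with FP16 vector arithmetic). The helper below is illustrative only.

enum class DataType
{
    F16,
    F32
};

// F32 is always eligible for the Winograd path; F16 additionally requires fast math.
bool winograd_data_type_supported(DataType data_type, bool enable_fast_math)
{
    if (data_type == DataType::F32)
    {
        return true;
    }
    return data_type == DataType::F16 && enable_fast_math;
}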
@@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
// Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
- CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true);
+ CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+ tensors, true);
CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
const bool is_nchw = _data_layout == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } };
+ ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
_permute_input->run(pack);
}
- CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true);
+ CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+ tensors, true);
CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
- ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } };
+ ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src},
+ {ACL_DST, winograd_input_transformed.get()},
+ {ACL_INT, input_workspace.get()}};
NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
- CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true);
+ CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+ tensors, true);
// Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
ITensorPack gemm_pack = tensors;
@@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
_gemm_function->run(gemm_pack);
// Output transform
- ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } };
+ ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+ {ACL_DST, is_nchw ? output_nhwc.get() : output},
+ {ACL_SRC_1, biases},
+ {ACL_INT, output_workspace.get()}};
NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
- if(is_nchw)
+ if (is_nchw)
{
        // Reorder the convolved output to ACL's NCHW ordering
- ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
_permute_output->run(pack);
}
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
_activation_func->run(pack);
}
}
void CpuWinogradConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_permute_weights->run(permute_tensors);
const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
        // The weights were in OHWI format before being permuted; "permuted_weights" is now in HWIO format.
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
const unsigned int width_idx = 2; // W in HWIO
const unsigned int channel_idx = 1; // I in HWIO
- const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
- const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
- const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+ const int permuted_weight_row_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+ const int permuted_weight_col_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+ const int permuted_weight_channel_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
        // Wrap the Winograd-domain transformed weight TensorInfo in an auxiliary tensor and allocate the required memory.
- ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+ ITensor *weights_transf =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
const void *permuted_weights_ptr;
void *win_wght_transf_ptr;
- permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
- win_wght_transf_ptr = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+ permuted_weights_ptr = reinterpret_cast<const void *>(
+ permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+ win_wght_transf_ptr =
+ reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+ winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
// Prepare Weights
_winograd_impl.weight_transform->execute(
- *_conv_args,
- permuted_weights_ptr,
- permuted_weight_row_stride,
- permuted_weight_col_stride,
- permuted_weight_channel_stride,
- win_wght_transf_ptr,
- _winograd_impl.winograd_spec,
- 0, 1 // Thread 1 of 1
+ *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+            permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Run single-threaded: thread 0 of 1
);
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
index e0df34e2db..7e1d952462 100644
--- a/src/cpu/operators/CpuWinogradConv2d.h
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -26,10 +26,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuGemm.h"
#include "src/cpu/operators/CpuPermute.h"
@@ -73,7 +74,11 @@ public:
     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
     *                              available, which can also introduce a drop in accuracy. Default is false
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
@@ -82,13 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -108,27 +117,28 @@ private:
PermutedOutput = TransformedInput,
Count = 10
};
- std::unique_ptr<CpuGemm> _gemm_function;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<ICPPKernel> _transform_input_kernel;
- std::unique_ptr<ICPPKernel> _transform_output_kernel;
- std::unique_ptr<CpuPermute> _permute_input;
- std::unique_ptr<CpuPermute> _permute_output;
- std::unique_ptr<CpuPermute> _permute_weights;
- experimental::MemoryRequirements _aux_mem{ Count };
- std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor
- arm_conv::winograd::WinogradImpl _winograd_impl;
- DataLayout _data_layout;
- TensorInfo _winograd_transformed_input;
- TensorInfo _winograd_transformed_output;
- TensorInfo _winograd_transformed_weights;
- TensorInfo _input_workspace;
- TensorInfo _output_workspace;
- TensorInfo _weights_hwio;
- TensorInfo _input_nhwc;
- TensorInfo _output_nhwc;
- bool _is_prepared;
- bool _run_activation;
+ std::unique_ptr<CpuGemm> _gemm_function;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<ICPPKernel> _transform_input_kernel;
+ std::unique_ptr<ICPPKernel> _transform_output_kernel;
+ std::unique_ptr<CpuPermute> _permute_input;
+ std::unique_ptr<CpuPermute> _permute_output;
+ std::unique_ptr<CpuPermute> _permute_weights;
+ experimental::MemoryRequirements _aux_mem{Count};
+ std::unique_ptr<arm_conv::ConvolutionArgs>
+ _conv_args; // Make it unique ptr because this type does not have a default constructor
+ arm_conv::winograd::WinogradImpl _winograd_impl;
+ DataLayout _data_layout;
+ TensorInfo _winograd_transformed_input;
+ TensorInfo _winograd_transformed_output;
+ TensorInfo _winograd_transformed_weights;
+ TensorInfo _input_workspace;
+ TensorInfo _output_workspace;
+ TensorInfo _weights_hwio;
+ TensorInfo _input_nhwc;
+ TensorInfo _output_nhwc;
+ bool _is_prepared;
+ bool _run_activation;
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 3069d6b541..343ef21c0b 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -24,12 +24,13 @@
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
#include <arm_neon.h>
@@ -53,7 +54,12 @@ namespace
* @param[in] num_threads Number of threads to run this method. Must be >= 1
*/
template <typename TypeInput, typename TypeOutput>
-void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm,
+ ITensor *dst,
+ const TypeInput *src,
+ int src_ld,
+ int src_multi_stride,
+ unsigned int num_threads)
{
ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
ARM_COMPUTE_ERROR_ON(num_threads == 0);
@@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutpu
const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
std::vector<IScheduler::Workload> workloads(num_threads);
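    // Split the pretranspose window evenly across the workloads: each thread handles the half-open range
    // [start, end) derived from its thread_id, and threads whose range is empty do nothing.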
- for(unsigned int t = 0; t < num_threads; ++t)
+ for (unsigned int t = 0; t < num_threads; ++t)
{
- workloads[t] = [ = ](const ThreadInfo & info)
+ workloads[t] = [=](const ThreadInfo &info)
{
const unsigned int start = (info.thread_id * wsize) / num_threads;
const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads;
- if(start < end)
+ if (start < end)
{
gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
}
@@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
p.sections = 1;
p.indirect = false;
- if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
{
p.indirect = true;
p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
@@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
}
// Update M in case of GEMM3D for output
- if(info.depth_output_gemm3d != 0)
+ if (info.depth_output_gemm3d != 0)
{
p.M = d->tensor_shape().y() * d->tensor_shape().z();
p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
@@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp
// Schedule assembly kernel
const int granule_threshold = 200;
IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+ if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
{
scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
+ else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D &&
+ (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+ data_type == DataType::S8))
{
        // GEMM_INTERLEAVED supports 2D parallelism; IScheduler::split_dimensions_all signals the scheduler to parallelise over all window dimensions
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+ else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D &&
+ (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
{
        // Special case for QASYMM8 to support 2D parallelism; the scheduler may be tuned differently here compared to the FP32 case
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
return scheduling_hint;
@@ -175,8 +186,12 @@ public:
* @param[in] gemm_info GEMM meta-data
* @param[in] os Output stage meta-data.
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os = {});
/** Set requantization shifts to be used
@@ -193,19 +208,20 @@ public:
*
* @return A tuple with the pointers to the shift and multiplier data respectively
*/
- std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers);
+ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+ set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
bool is_configured() const override;
experimental::MemoryRequirements workspace() const override;
bool isVarWeightsKernel() const override
{
- if(!_gemm_kernel_asm)
+ if (!_gemm_kernel_asm)
return false;
- const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+ const arm_compute::WeightFormat wf =
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
}
@@ -229,15 +245,15 @@ private:
void prepare_indirect_buffer(ITensorPack &tensors);
/** Assembly Gemm kernel */
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
/** Optimised Arm® Neon™ kernel */
- std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+ std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
/** Assembly GEMM workspace tensor info */
TensorInfo _workspace_info{};
/** Pre-transpose tensor info */
TensorInfo _pretranspose_info{};
/** Prepared flag */
- bool _is_prepared{ false };
+ bool _is_prepared{false};
/** GEMM meta-data */
AsmGemmInfo _gemm_info{};
/** GEMM kernel description */
@@ -251,26 +267,27 @@ private:
/** Indirect buffer */
std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
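    // _indirect_arg is the table of row-pointer blocks handed to the assembly kernel; each entry points into
    // _indirect_buf, which stores one input-row (or padding) pointer per kernel/output position.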
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
- experimental::MemoryRequirements _aux_mem{ Count };
- bool _B_pretranspose_required{ false };
- bool _is_b_constant{ true };
- bool _is_c_constant{ true };
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+ experimental::MemoryRequirements _aux_mem{Count};
+ bool _B_pretranspose_required{false};
+ bool _is_b_constant{true};
+ bool _is_c_constant{true};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+ const std::vector<int32_t> &multipliers)
{
_multipliers = multipliers;
_shifts = shifts;
bool need_left = false;
- for(const auto s : _shifts)
+ for (const auto s : _shifts)
{
left_shifts.push_back(std::max(-s, int32_t(0)));
right_shifts.push_back(std::min(-s, int32_t(0)));
- if(s < 0 && !need_left)
+ if (s < 0 && !need_left)
{
need_left = true;
}
@@ -295,32 +312,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInput);
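    // For every (multi, batch, output position, kernel position) store a pointer to the matching input row, or to
    // the zero-padding vector when the receptive field falls outside the input.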
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ for (int64_t output_y = 0; output_y < _cp.output_height; output_y++)
{
- for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ for (int64_t output_x = 0; output_x < _cp.output_width; output_x++)
{
int64_t output_xy = (output_y * _cp.output_width) + output_x;
- for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
{
- for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
{
int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
int64_t input_xy = (input_y * _cp.input_width) + input_x;
- if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+ if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_pad.data();
}
else
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
}
}
@@ -332,12 +352,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
float zeropad = 0.f;
- if(is_data_type_quantized(a->data_type()))
+ if (is_data_type_quantized(a->data_type()))
{
zeropad = a->quantization_info().uniform().offset;
}
@@ -350,16 +373,25 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
- _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
- info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
- };
-
- if(info.method == AsmConvMethod::Conv)
+ _cp = {input_width,
+ input_height,
+ input_channels,
+ kernel_width,
+ kernel_height,
+ output_width,
+ output_height,
+ info.ps_info.stride().first,
+ info.ps_info.stride().second,
+ info.padding_top,
+ info.padding_left,
+ zeropad};
+
+ if (info.method == AsmConvMethod::Conv)
{
_gemm_kernel_asm->set_convolution_parameters(_cp);
}
- if(info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Indirect)
{
const unsigned int multis = 1;
const unsigned int batches = a->tensor_shape().total_size_upper(3);
@@ -372,19 +404,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
- _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
- _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
+ reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
+ reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
_indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
// Set indirect argument
int64_t pos = 0;
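        // Each entry of _indirect_arg addresses the block of _indirect_buf that holds the row pointers for one
        // (multi, batch, kernel position) combination.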
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
{
- (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+ (_indirect_arg.get())[pos++] =
+ _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
}
}
}
@@ -394,8 +429,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os)
{
ARM_COMPUTE_UNUSED(c);
@@ -404,7 +443,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_is_c_constant = c ? c->are_values_constant() : true;
_gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_gemm_kernel_asm == nullptr)
+ if (_gemm_kernel_asm == nullptr)
{
        // Configuration not supported: leave the function unconfigured
return;
@@ -419,13 +458,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
const size_t workspace_size = _gemm_kernel_asm->get_working_size();
const unsigned int alignment = 4096;
_workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
- _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[AsmGemmWorkspace] =
+ MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
    // If we disable the braced code below, ConvLayer deadlocks when threads > 1 and
    // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
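        // Clamping the kernel's thread count to the window size is what prevents the deadlock described above when
        // there are fewer work items than threads.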
- if(window_size < static_cast<unsigned int>(args._maxthreads))
+ if (window_size < static_cast<unsigned int>(args._maxthreads))
{
_gemm_kernel_asm->set_nthreads(window_size);
}
@@ -434,18 +474,19 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_optimised_kernel = std::move(acl_gemm_wrapper);
_gemm_info = gemm_info;
// Check for pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
_pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
- _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
- _B_pretranspose_required = true;
+ _aux_mem[Pretranspose] =
+ MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+ _B_pretranspose_required = true;
}
// Handle indirect GEMM convolution
- if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
{
configure_indirect(a, b, d, gemm_info);
}
@@ -454,34 +495,39 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
        // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Fixed format kernels need no pretranspose.
- ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto in1_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ in1_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
b->mark_as_unused();
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
prepare_indirect_buffer(tensors);
}
@@ -526,12 +572,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
int multi_stride_b = 0;
const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
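    // Leading dimensions and strides are expressed in elements, hence the division of the byte strides by the
    // element size.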
- auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
    // Check if B is pre-transposed and de-reference it if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
+ if (!_gemm_kernel_asm->B_is_pretransposed())
{
ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
@@ -539,30 +585,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
    // Run the pretranspose every time if either the weights or the biases are non-constant
- if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+ if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
{
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_B_pretranspose_required)
+ if (_B_pretranspose_required)
{
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto b_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
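            // When B is constant only requantize_bias() is needed to refresh the bias contribution; otherwise the
            // full parallel pretranspose of B is re-run.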
- if(_is_b_constant)
+ if (_is_b_constant)
{
_gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
}
else
{
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ b_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
}
}
}
@@ -571,17 +621,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
    // Set the workspace if needed and reset the number of threads, as the buffer manager gets re-created with max_threads
CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
- if(workspace.get()->buffer() != nullptr)
+ if (workspace.get()->buffer() != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
const unsigned int split_dim = scheduling_hint.split_dimension();
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
+ if (window_size < num_threads)
{
num_threads = window_size;
}
- if(split_dim != IScheduler::split_dimensions_all)
+ if (split_dim != IScheduler::split_dimensions_all)
{
// Make sure the kernel does not expect more threads than we can actually spawn
const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
@@ -595,12 +645,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
    // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
TypeOutput *bias = nullptr;
- if(c && c->info()->data_type() != DataType::S32)
+ if (c && c->info()->data_type() != DataType::S32)
{
bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
in0_ptr = nullptr;
lda = 0;
@@ -609,18 +659,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
// Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr,
+ ldd, batch_stride_d, multi_stride_d, bias, 0);
// Schedule
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
const CPUInfo &ci = NEScheduler::get().cpu_info();
@@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(activation);
Params p = extract_parameters(a, b, d, info);
@@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
const GEMMLowpOutputStageInfo os_info = info.output_stage;
arm_gemm::Requantize32 gemm_requant_info{};
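    // With more than one shift, per-channel requantization data (pointer arrays of shifts and multipliers) is
    // passed to Requantize32; otherwise a single scalar shift and multiplier are used.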
- if(os_info.gemmlowp_shifts.size() > 1)
+ if (os_info.gemmlowp_shifts.size() > 1)
{
- const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
- std::get<2>(requantize_data),
- std::get<3>(requantize_data),
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ const auto requantize_data =
+ fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+ gemm_requant_info = arm_gemm::Requantize32(
+ nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+ (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+ std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
else
{
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ gemm_requant_info =
+ arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+ os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
// Configure fallback
@@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
}
} //namespace
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
- : _arm_gemm(nullptr)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
{
}
-Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_UNUSED(c);
@@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
- switch(a->data_type())
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
+ switch (a->data_type())
{
case DataType::F32:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F32 input");
break;
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8 input and U8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8 input and U8 output");
}
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8 input and S8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8 input and S8 output");
}
break;
#endif /* __aarch64__ */
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for BFLOAT16 input and F32 output");
break;
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F16 input and F16 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F16 input and F16 output");
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -759,26 +826,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
return Status{};
}
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::validate(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(c, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+ "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_data_type_quantized_per_channel(b->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ if (is_data_type_quantized_per_channel(b->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
}
- else if(is_fixed_format_fast_math(info.weight_format))
+ else if (is_fixed_format_fast_math(info.weight_format))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+ "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+ "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32,
+ "Only F32 output supported for BFLOAT16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32,
+ "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
+ "Only S32 output supported for S8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
+ (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
"Only QASYMM8/S32 output supported for QASYMM8 input");
arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
- if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+ if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
{
// Correctness check: if the format expected by the kernel is
// not "any", make sure that the one found matches the format
// intended by the caller.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
- "The format expected by the kernel does not correspond with the one requested by the user.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (expected_weight_format != info.weight_format),
+ "The format expected by the kernel does not correspond with the one requested by the user.");
}
return ret;
}
@@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo
return act.type != arm_gemm::Activation::Type::None;
}
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
//If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+ if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
{
return;
}
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::F32:
create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
@@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
}
@@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
}
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index ceb7a3f775..5be39a54c0 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -42,20 +43,20 @@ enum class AsmConvMethod
struct AsmGemmInfo
{
- AsmConvMethod method{ AsmConvMethod::Im2Col };
+ AsmConvMethod method{AsmConvMethod::Im2Col};
PadStrideInfo ps_info{};
ActivationLayerInfo activation_info{};
GEMMLowpOutputStageInfo output_stage{};
- bool negated_offsets{ true };
- bool reinterpret_input_as_3d{ false };
- bool depth_output_gemm3d{ false };
- int64_t padding_top{ 0 };
- int64_t padding_left{ 0 };
- float padding_value{ 0.f };
- bool fast_mode{ false };
- bool fixed_format{ false };
- arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
- bool reshape_b_only_on_first_run{ true };
+ bool negated_offsets{true};
+ bool reinterpret_input_as_3d{false};
+ bool depth_output_gemm3d{false};
+ int64_t padding_top{0};
+ int64_t padding_left{0};
+ float padding_value{0.f};
+ bool fast_mode{false};
+ bool fixed_format{false};
+ arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+ bool reshape_b_only_on_first_run{true};
};
/** Assembly kernel glue */
@@ -72,12 +73,12 @@ public:
class IFallback
{
public:
- virtual void run(ITensorPack &tensors) = 0;
- virtual void prepare(ITensorPack &tensors) = 0;
- virtual experimental::MemoryRequirements workspace() const = 0;
- virtual bool is_configured() const = 0;
- virtual bool isVarWeightsKernel() const = 0;
- virtual ~IFallback() = default;
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual experimental::MemoryRequirements workspace() const = 0;
+ virtual bool is_configured() const = 0;
+ virtual bool isVarWeightsKernel() const = 0;
+ virtual ~IFallback() = default;
};
public:
@@ -121,7 +122,8 @@ public:
     * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
* @param[in] info GEMM meta-data
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+ void configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -133,7 +135,11 @@ public:
*
* @return a status.
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -144,7 +150,12 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Checks if activation is supported by the gemm assembly dispatcher
*
* @param[in] activation Activation to check
@@ -167,8 +178,8 @@ public:
}
// Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index ae1cffb659..e23b88a777 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -39,25 +39,26 @@ namespace cpu
class CpuAuxTensorHandler
{
public:
- CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+ CpuAuxTensorHandler(
+ int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
: _tensor()
{
- if(info.total_size() == 0)
+ if (info.total_size() == 0)
{
return;
}
_tensor.allocator()->soft_init(info);
ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
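        // If the pack has no tensor for this slot, or the packed tensor is too small, fall back to an owned
        // allocation (unless bypass_alloc is set) and optionally inject it into the pack.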
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+ if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
{
- if(!bypass_alloc)
+ if (!bypass_alloc)
{
_tensor.allocator()->allocate();
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
}
- if(pack_inject)
+ if (pack_inject)
{
pack.add_tensor(slot_id, &_tensor);
_injected_tensor_pack = &pack;
@@ -70,22 +71,21 @@ public:
}
}
- CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
- : _tensor()
+ CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor()
{
_tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
+ if (info.total_size() <= tensor.info()->total_size())
{
_tensor.allocator()->import_memory(tensor.buffer());
}
}
- CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
+ CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete;
~CpuAuxTensorHandler()
{
- if(_injected_tensor_pack)
+ if (_injected_tensor_pack)
{
_injected_tensor_pack->remove_tensor(_injected_slot_id);
}
@@ -103,9 +103,9 @@ public:
private:
Tensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
+ ITensorPack *_injected_tensor_pack{nullptr};
+ int _injected_slot_id{TensorType::ACL_UNKNOWN};
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */