From 473cb01e84cef6cab057e9492bfa3b68f708e5d7 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 23 Feb 2021 11:48:12 +0000 Subject: Remove Compute Vision CL support Resolves COMPMID-4151 Change-Id: I46f541efe8c4087f27794d2e158b6c1547d459ba Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5160 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio --- src/core/CL/CLKernelLibrary.cpp | 249 +-- src/core/CL/CLKernels.h | 34 - src/core/CL/cl_kernels/absdiff.cl | 65 - src/core/CL/cl_kernels/accumulate.cl | 130 -- src/core/CL/cl_kernels/canny.cl | 454 ----- src/core/CL/cl_kernels/channel_combine.cl | 416 ----- src/core/CL/cl_kernels/channel_extract.cl | 272 --- src/core/CL/cl_kernels/color_convert.cl | 1911 -------------------- src/core/CL/cl_kernels/convolution3x3.cl | 137 -- src/core/CL/cl_kernels/convolution5x5.cl | 287 --- src/core/CL/cl_kernels/convolution7x7.cl | 338 ---- src/core/CL/cl_kernels/convolution9x9.cl | 403 ----- src/core/CL/cl_kernels/convolution_rectangle.cl | 118 -- src/core/CL/cl_kernels/derivative.cl | 80 - src/core/CL/cl_kernels/dilate.cl | 56 - src/core/CL/cl_kernels/erode.cl | 56 - src/core/CL/cl_kernels/fast_corners.cl | 262 --- src/core/CL/cl_kernels/gaussian_pyramid.cl | 113 -- src/core/CL/cl_kernels/harris_corners.cl | 376 ---- src/core/CL/cl_kernels/histogram.cl | 243 --- src/core/CL/cl_kernels/hog.cl | 456 ----- src/core/CL/cl_kernels/integral_image.cl | 100 - src/core/CL/cl_kernels/magnitude_phase.cl | 162 -- src/core/CL/cl_kernels/mean_stddev.cl | 82 - src/core/CL/cl_kernels/minmaxloc.cl | 193 -- src/core/CL/cl_kernels/non_linear_filter3x3.cl | 186 -- src/core/CL/cl_kernels/non_linear_filter5x5.cl | 483 ----- src/core/CL/cl_kernels/non_linear_filter_helpers.h | 145 -- src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl | 521 ------ src/core/CL/cl_kernels/scharr_filter.cl | 124 -- src/core/CL/cl_kernels/tablelookup.cl | 114 -- src/core/CL/cl_kernels/threshold.cl | 104 -- src/core/CL/cl_kernels/warp_affine.cl | 120 -- src/core/CL/cl_kernels/warp_perspective.cl | 128 -- src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp | 104 -- src/core/CL/kernels/CLAbsoluteDifferenceKernel.h | 79 - src/core/CL/kernels/CLAccumulateKernel.cpp | 101 -- src/core/CL/kernels/CLAccumulateKernel.h | 114 -- src/core/CL/kernels/CLBox3x3Kernel.cpp | 81 - src/core/CL/kernels/CLBox3x3Kernel.h | 59 - src/core/CL/kernels/CLCannyEdgeKernel.cpp | 310 ---- src/core/CL/kernels/CLCannyEdgeKernel.h | 185 -- src/core/CL/kernels/CLChannelCombineKernel.cpp | 296 --- src/core/CL/kernels/CLChannelCombineKernel.h | 102 -- src/core/CL/kernels/CLChannelExtractKernel.cpp | 196 -- src/core/CL/kernels/CLChannelExtractKernel.h | 95 - src/core/CL/kernels/CLColorConvertKernel.cpp | 558 ------ src/core/CL/kernels/CLColorConvertKernel.h | 121 -- src/core/CL/kernels/CLConvolutionKernel.cpp | 392 ---- src/core/CL/kernels/CLConvolutionKernel.h | 224 --- src/core/CL/kernels/CLDerivativeKernel.cpp | 155 -- src/core/CL/kernels/CLDerivativeKernel.h | 83 - src/core/CL/kernels/CLDilateKernel.cpp | 70 - src/core/CL/kernels/CLDilateKernel.h | 59 - src/core/CL/kernels/CLErodeKernel.cpp | 70 - src/core/CL/kernels/CLErodeKernel.h | 59 - src/core/CL/kernels/CLFastCornersKernel.cpp | 209 --- src/core/CL/kernels/CLFastCornersKernel.h | 133 -- src/core/CL/kernels/CLGaussian3x3Kernel.cpp | 81 - src/core/CL/kernels/CLGaussian3x3Kernel.h | 59 - src/core/CL/kernels/CLGaussian5x5Kernel.cpp | 55 - src/core/CL/kernels/CLGaussian5x5Kernel.h | 83 - src/core/CL/kernels/CLGaussianPyramidKernel.cpp | 247 --- src/core/CL/kernels/CLGaussianPyramidKernel.h | 111 -- src/core/CL/kernels/CLHOGDescriptorKernel.cpp | 237 --- src/core/CL/kernels/CLHOGDescriptorKernel.h | 122 -- src/core/CL/kernels/CLHOGDetectorKernel.cpp | 146 -- src/core/CL/kernels/CLHOGDetectorKernel.h | 96 - src/core/CL/kernels/CLHarrisCornersKernel.cpp | 149 -- src/core/CL/kernels/CLHarrisCornersKernel.h | 100 - src/core/CL/kernels/CLHistogramKernel.cpp | 253 --- src/core/CL/kernels/CLHistogramKernel.h | 111 -- src/core/CL/kernels/CLIntegralImageKernel.cpp | 146 -- src/core/CL/kernels/CLIntegralImageKernel.h | 86 - src/core/CL/kernels/CLMagnitudePhaseKernel.cpp | 176 -- src/core/CL/kernels/CLMagnitudePhaseKernel.h | 90 - src/core/CL/kernels/CLMeanStdDevKernel.cpp | 156 -- src/core/CL/kernels/CLMeanStdDevKernel.h | 98 - src/core/CL/kernels/CLMedian3x3Kernel.cpp | 88 - src/core/CL/kernels/CLMedian3x3Kernel.h | 59 - src/core/CL/kernels/CLMinMaxLocationKernel.cpp | 246 --- src/core/CL/kernels/CLMinMaxLocationKernel.h | 124 -- src/core/CL/kernels/CLNonLinearFilterKernel.cpp | 104 -- src/core/CL/kernels/CLNonLinearFilterKernel.h | 77 - .../CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp | 78 - .../CL/kernels/CLNonMaximaSuppression3x3Kernel.h | 60 - src/core/CL/kernels/CLScharr3x3Kernel.cpp | 127 -- src/core/CL/kernels/CLScharr3x3Kernel.h | 97 - src/core/CL/kernels/CLSobel3x3Kernel.cpp | 141 -- src/core/CL/kernels/CLSobel3x3Kernel.h | 83 - src/core/CL/kernels/CLSobel5x5Kernel.cpp | 251 --- src/core/CL/kernels/CLSobel5x5Kernel.h | 139 -- src/core/CL/kernels/CLSobel7x7Kernel.cpp | 255 --- src/core/CL/kernels/CLSobel7x7Kernel.h | 139 -- src/core/CL/kernels/CLTableLookupKernel.cpp | 68 - src/core/CL/kernels/CLTableLookupKernel.h | 55 - src/core/CL/kernels/CLThresholdKernel.cpp | 81 - src/core/CL/kernels/CLThresholdKernel.h | 57 - src/core/CL/kernels/CLWarpAffineKernel.cpp | 133 -- src/core/CL/kernels/CLWarpAffineKernel.h | 62 - src/core/CL/kernels/CLWarpPerspectiveKernel.cpp | 105 -- src/core/CL/kernels/CLWarpPerspectiveKernel.h | 59 - src/core/NEON/NEKernels.h | 3 +- src/core/NEON/kernels/NEConvolutionKernel.cpp | 1625 ----------------- src/core/NEON/kernels/NEConvolutionKernel.h | 299 --- .../kernels/NENonMaximaSuppression3x3Kernel.cpp | 516 ------ .../NEON/kernels/NENonMaximaSuppression3x3Kernel.h | 107 -- src/core/NEON/kernels/NERemapKernel.cpp | 237 +++ src/core/NEON/kernels/NERemapKernel.h | 83 + src/runtime/CL/functions/CLAbsoluteDifference.cpp | 42 - src/runtime/CL/functions/CLAccumulate.cpp | 66 - src/runtime/CL/functions/CLBox3x3.cpp | 45 - src/runtime/CL/functions/CLCannyEdge.cpp | 214 --- src/runtime/CL/functions/CLChannelCombine.cpp | 54 - src/runtime/CL/functions/CLChannelExtract.cpp | 54 - src/runtime/CL/functions/CLColorConvert.cpp | 78 - src/runtime/CL/functions/CLConvolution.cpp | 144 -- src/runtime/CL/functions/CLDerivative.cpp | 45 - src/runtime/CL/functions/CLDilate.cpp | 45 - src/runtime/CL/functions/CLEqualizeHistogram.cpp | 124 -- src/runtime/CL/functions/CLErode.cpp | 45 - src/runtime/CL/functions/CLFastCorners.cpp | 141 -- src/runtime/CL/functions/CLGaussian3x3.cpp | 45 - src/runtime/CL/functions/CLGaussian5x5.cpp | 81 - src/runtime/CL/functions/CLGaussianPyramid.cpp | 214 --- src/runtime/CL/functions/CLHOGDescriptor.cpp | 124 -- src/runtime/CL/functions/CLHOGDetector.cpp | 77 - src/runtime/CL/functions/CLHOGGradient.cpp | 93 - src/runtime/CL/functions/CLHOGMultiDetection.cpp | 282 --- src/runtime/CL/functions/CLHarrisCorners.cpp | 198 -- src/runtime/CL/functions/CLHistogram.cpp | 50 - src/runtime/CL/functions/CLIntegralImage.cpp | 54 - src/runtime/CL/functions/CLLaplacianPyramid.cpp | 112 -- .../CL/functions/CLLaplacianReconstruct.cpp | 108 -- src/runtime/CL/functions/CLMagnitude.cpp | 42 - src/runtime/CL/functions/CLMeanStdDev.cpp | 177 -- src/runtime/CL/functions/CLMedian3x3.cpp | 45 - src/runtime/CL/functions/CLMinMaxLocation.cpp | 108 -- src/runtime/CL/functions/CLNonLinearFilter.cpp | 46 - .../CL/functions/CLNonMaximaSuppression3x3.cpp | 52 - src/runtime/CL/functions/CLOpticalFlow.cpp | 184 -- src/runtime/CL/functions/CLPhase.cpp | 42 - src/runtime/CL/functions/CLScharr3x3.cpp | 45 - src/runtime/CL/functions/CLSobel3x3.cpp | 47 - src/runtime/CL/functions/CLSobel5x5.cpp | 101 -- src/runtime/CL/functions/CLSobel7x7.cpp | 101 -- src/runtime/CL/functions/CLTableLookup.cpp | 42 - src/runtime/CL/functions/CLThreshold.cpp | 43 - src/runtime/CL/functions/CLWarpAffine.cpp | 46 - src/runtime/CL/functions/CLWarpPerspective.cpp | 46 - src/runtime/NEON/functions/NEConvolution.cpp | 149 -- .../NEON/functions/NENonMaximaSuppression3x3.cpp | 50 - src/runtime/NEON/functions/NERemap.cpp | 55 + 153 files changed, 377 insertions(+), 24512 deletions(-) delete mode 100644 src/core/CL/cl_kernels/absdiff.cl delete mode 100644 src/core/CL/cl_kernels/accumulate.cl delete mode 100644 src/core/CL/cl_kernels/canny.cl delete mode 100644 src/core/CL/cl_kernels/channel_combine.cl delete mode 100644 src/core/CL/cl_kernels/channel_extract.cl delete mode 100644 src/core/CL/cl_kernels/color_convert.cl delete mode 100644 src/core/CL/cl_kernels/convolution3x3.cl delete mode 100644 src/core/CL/cl_kernels/convolution5x5.cl delete mode 100644 src/core/CL/cl_kernels/convolution7x7.cl delete mode 100644 src/core/CL/cl_kernels/convolution9x9.cl delete mode 100644 src/core/CL/cl_kernels/convolution_rectangle.cl delete mode 100644 src/core/CL/cl_kernels/derivative.cl delete mode 100644 src/core/CL/cl_kernels/dilate.cl delete mode 100644 src/core/CL/cl_kernels/erode.cl delete mode 100644 src/core/CL/cl_kernels/fast_corners.cl delete mode 100644 src/core/CL/cl_kernels/gaussian_pyramid.cl delete mode 100644 src/core/CL/cl_kernels/harris_corners.cl delete mode 100644 src/core/CL/cl_kernels/histogram.cl delete mode 100644 src/core/CL/cl_kernels/hog.cl delete mode 100644 src/core/CL/cl_kernels/integral_image.cl delete mode 100644 src/core/CL/cl_kernels/magnitude_phase.cl delete mode 100644 src/core/CL/cl_kernels/mean_stddev.cl delete mode 100644 src/core/CL/cl_kernels/minmaxloc.cl delete mode 100644 src/core/CL/cl_kernels/non_linear_filter3x3.cl delete mode 100644 src/core/CL/cl_kernels/non_linear_filter5x5.cl delete mode 100644 src/core/CL/cl_kernels/non_linear_filter_helpers.h delete mode 100644 src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl delete mode 100644 src/core/CL/cl_kernels/scharr_filter.cl delete mode 100644 src/core/CL/cl_kernels/tablelookup.cl delete mode 100644 src/core/CL/cl_kernels/threshold.cl delete mode 100644 src/core/CL/cl_kernels/warp_affine.cl delete mode 100644 src/core/CL/cl_kernels/warp_perspective.cl delete mode 100644 src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp delete mode 100644 src/core/CL/kernels/CLAbsoluteDifferenceKernel.h delete mode 100644 src/core/CL/kernels/CLAccumulateKernel.cpp delete mode 100644 src/core/CL/kernels/CLAccumulateKernel.h delete mode 100644 src/core/CL/kernels/CLBox3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLBox3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLCannyEdgeKernel.cpp delete mode 100644 src/core/CL/kernels/CLCannyEdgeKernel.h delete mode 100644 src/core/CL/kernels/CLChannelCombineKernel.cpp delete mode 100644 src/core/CL/kernels/CLChannelCombineKernel.h delete mode 100644 src/core/CL/kernels/CLChannelExtractKernel.cpp delete mode 100644 src/core/CL/kernels/CLChannelExtractKernel.h delete mode 100644 src/core/CL/kernels/CLColorConvertKernel.cpp delete mode 100644 src/core/CL/kernels/CLColorConvertKernel.h delete mode 100644 src/core/CL/kernels/CLConvolutionKernel.cpp delete mode 100644 src/core/CL/kernels/CLConvolutionKernel.h delete mode 100644 src/core/CL/kernels/CLDerivativeKernel.cpp delete mode 100644 src/core/CL/kernels/CLDerivativeKernel.h delete mode 100644 src/core/CL/kernels/CLDilateKernel.cpp delete mode 100644 src/core/CL/kernels/CLDilateKernel.h delete mode 100644 src/core/CL/kernels/CLErodeKernel.cpp delete mode 100644 src/core/CL/kernels/CLErodeKernel.h delete mode 100644 src/core/CL/kernels/CLFastCornersKernel.cpp delete mode 100644 src/core/CL/kernels/CLFastCornersKernel.h delete mode 100644 src/core/CL/kernels/CLGaussian3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLGaussian3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLGaussian5x5Kernel.cpp delete mode 100644 src/core/CL/kernels/CLGaussian5x5Kernel.h delete mode 100644 src/core/CL/kernels/CLGaussianPyramidKernel.cpp delete mode 100644 src/core/CL/kernels/CLGaussianPyramidKernel.h delete mode 100644 src/core/CL/kernels/CLHOGDescriptorKernel.cpp delete mode 100644 src/core/CL/kernels/CLHOGDescriptorKernel.h delete mode 100644 src/core/CL/kernels/CLHOGDetectorKernel.cpp delete mode 100644 src/core/CL/kernels/CLHOGDetectorKernel.h delete mode 100644 src/core/CL/kernels/CLHarrisCornersKernel.cpp delete mode 100644 src/core/CL/kernels/CLHarrisCornersKernel.h delete mode 100644 src/core/CL/kernels/CLHistogramKernel.cpp delete mode 100644 src/core/CL/kernels/CLHistogramKernel.h delete mode 100644 src/core/CL/kernels/CLIntegralImageKernel.cpp delete mode 100644 src/core/CL/kernels/CLIntegralImageKernel.h delete mode 100644 src/core/CL/kernels/CLMagnitudePhaseKernel.cpp delete mode 100644 src/core/CL/kernels/CLMagnitudePhaseKernel.h delete mode 100644 src/core/CL/kernels/CLMeanStdDevKernel.cpp delete mode 100644 src/core/CL/kernels/CLMeanStdDevKernel.h delete mode 100644 src/core/CL/kernels/CLMedian3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLMedian3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLMinMaxLocationKernel.cpp delete mode 100644 src/core/CL/kernels/CLMinMaxLocationKernel.h delete mode 100644 src/core/CL/kernels/CLNonLinearFilterKernel.cpp delete mode 100644 src/core/CL/kernels/CLNonLinearFilterKernel.h delete mode 100644 src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLScharr3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLScharr3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLSobel3x3Kernel.cpp delete mode 100644 src/core/CL/kernels/CLSobel3x3Kernel.h delete mode 100644 src/core/CL/kernels/CLSobel5x5Kernel.cpp delete mode 100644 src/core/CL/kernels/CLSobel5x5Kernel.h delete mode 100644 src/core/CL/kernels/CLSobel7x7Kernel.cpp delete mode 100644 src/core/CL/kernels/CLSobel7x7Kernel.h delete mode 100644 src/core/CL/kernels/CLTableLookupKernel.cpp delete mode 100644 src/core/CL/kernels/CLTableLookupKernel.h delete mode 100644 src/core/CL/kernels/CLThresholdKernel.cpp delete mode 100644 src/core/CL/kernels/CLThresholdKernel.h delete mode 100644 src/core/CL/kernels/CLWarpAffineKernel.cpp delete mode 100644 src/core/CL/kernels/CLWarpAffineKernel.h delete mode 100644 src/core/CL/kernels/CLWarpPerspectiveKernel.cpp delete mode 100644 src/core/CL/kernels/CLWarpPerspectiveKernel.h delete mode 100644 src/core/NEON/kernels/NEConvolutionKernel.cpp delete mode 100644 src/core/NEON/kernels/NEConvolutionKernel.h delete mode 100644 src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp delete mode 100644 src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h create mode 100644 src/core/NEON/kernels/NERemapKernel.cpp create mode 100644 src/core/NEON/kernels/NERemapKernel.h delete mode 100644 src/runtime/CL/functions/CLAbsoluteDifference.cpp delete mode 100644 src/runtime/CL/functions/CLAccumulate.cpp delete mode 100644 src/runtime/CL/functions/CLBox3x3.cpp delete mode 100644 src/runtime/CL/functions/CLCannyEdge.cpp delete mode 100644 src/runtime/CL/functions/CLChannelCombine.cpp delete mode 100644 src/runtime/CL/functions/CLChannelExtract.cpp delete mode 100644 src/runtime/CL/functions/CLColorConvert.cpp delete mode 100644 src/runtime/CL/functions/CLConvolution.cpp delete mode 100644 src/runtime/CL/functions/CLDerivative.cpp delete mode 100644 src/runtime/CL/functions/CLDilate.cpp delete mode 100644 src/runtime/CL/functions/CLEqualizeHistogram.cpp delete mode 100644 src/runtime/CL/functions/CLErode.cpp delete mode 100644 src/runtime/CL/functions/CLFastCorners.cpp delete mode 100644 src/runtime/CL/functions/CLGaussian3x3.cpp delete mode 100644 src/runtime/CL/functions/CLGaussian5x5.cpp delete mode 100644 src/runtime/CL/functions/CLGaussianPyramid.cpp delete mode 100644 src/runtime/CL/functions/CLHOGDescriptor.cpp delete mode 100644 src/runtime/CL/functions/CLHOGDetector.cpp delete mode 100644 src/runtime/CL/functions/CLHOGGradient.cpp delete mode 100644 src/runtime/CL/functions/CLHOGMultiDetection.cpp delete mode 100644 src/runtime/CL/functions/CLHarrisCorners.cpp delete mode 100644 src/runtime/CL/functions/CLHistogram.cpp delete mode 100644 src/runtime/CL/functions/CLIntegralImage.cpp delete mode 100644 src/runtime/CL/functions/CLLaplacianPyramid.cpp delete mode 100644 src/runtime/CL/functions/CLLaplacianReconstruct.cpp delete mode 100644 src/runtime/CL/functions/CLMagnitude.cpp delete mode 100644 src/runtime/CL/functions/CLMeanStdDev.cpp delete mode 100644 src/runtime/CL/functions/CLMedian3x3.cpp delete mode 100644 src/runtime/CL/functions/CLMinMaxLocation.cpp delete mode 100644 src/runtime/CL/functions/CLNonLinearFilter.cpp delete mode 100644 src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp delete mode 100644 src/runtime/CL/functions/CLOpticalFlow.cpp delete mode 100644 src/runtime/CL/functions/CLPhase.cpp delete mode 100644 src/runtime/CL/functions/CLScharr3x3.cpp delete mode 100644 src/runtime/CL/functions/CLSobel3x3.cpp delete mode 100644 src/runtime/CL/functions/CLSobel5x5.cpp delete mode 100644 src/runtime/CL/functions/CLSobel7x7.cpp delete mode 100644 src/runtime/CL/functions/CLTableLookup.cpp delete mode 100644 src/runtime/CL/functions/CLThreshold.cpp delete mode 100644 src/runtime/CL/functions/CLWarpAffine.cpp delete mode 100644 src/runtime/CL/functions/CLWarpPerspective.cpp delete mode 100644 src/runtime/NEON/functions/NEConvolution.cpp delete mode 100644 src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp create mode 100644 src/runtime/NEON/functions/NERemap.cpp (limited to 'src') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 75f76ea344..14d3a2cad5 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -177,10 +177,6 @@ std::string decompress_zlib(const std::string &str) using namespace arm_compute; const std::map CLKernelLibrary::_kernel_program_map = { - { "absdiff", "absdiff.cl" }, - { "accumulate", "accumulate.cl" }, - { "accumulate_squared", "accumulate.cl" }, - { "accumulate_weighted", "accumulate.cl" }, { "activation_layer", "activation_layer.cl" }, { "activation_layer_quant", "activation_layer_quant.cl" }, { "activation_layer_quant_f32", "activation_layer_quant.cl" }, @@ -200,21 +196,8 @@ const std::map CLKernelLibrary::_kernel_program_map = { "bitwise_not", "bitwise_op.cl" }, { "bounding_box_transform", "bounding_box_transform.cl" }, { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" }, - { "channel_combine_NV", "channel_combine.cl" }, - { "channel_combine_RGB888", "channel_combine.cl" }, - { "channel_combine_RGBA8888", "channel_combine.cl" }, - { "channel_combine_UYVY422", "channel_combine.cl" }, - { "channel_combine_YUYV422", "channel_combine.cl" }, { "channel_shuffle_nchw", "channel_shuffle.cl" }, { "channel_shuffle_nhwc", "channel_shuffle.cl" }, - { "channel_extract_NV12", "channel_extract.cl" }, - { "channel_extract_NV21", "channel_extract.cl" }, - { "channel_extract_RGB888", "channel_extract.cl" }, - { "channel_extract_RGBA8888", "channel_extract.cl" }, - { "channel_extract_UYVY422", "channel_extract.cl" }, - { "channel_extract_YUYV422", "channel_extract.cl" }, - { "combine_gradients_L1", "canny.cl" }, - { "combine_gradients_L2", "canny.cl" }, { "compare_equal", "comparisons.cl" }, { "compare_equal_quantized", "comparisons.cl" }, { "compare_notequal", "comparisons.cl" }, @@ -232,25 +215,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "concatenate_height", "concatenate.cl" }, { "concatenate_width_x2", "concatenate.cl" }, { "concatenate_width_x4", "concatenate.cl" }, - { "convolution_rectangle", "convolution_rectangle.cl" }, { "col2im", "col2im.cl" }, { "convert_depth_down", "depth_convert.cl" }, { "convert_depth_up", "depth_convert.cl" }, { "convert_fc_weights", "convert_fc_weights.cl" }, - { "convolution3x3_static", "convolution3x3.cl" }, - { "convolution5x5_static", "convolution5x5.cl" }, - { "convolution7x7_static", "convolution7x7.cl" }, - { "convolution9x9_static", "convolution9x9.cl" }, - { "convolution_separable1x5_static", "convolution5x5.cl" }, - { "convolution_separable5x1_static", "convolution5x5.cl" }, - { "convolution_separable1x7_static", "convolution7x7.cl" }, - { "convolution_separable7x1_static", "convolution7x7.cl" }, - { "convolution_separable1x9_static", "convolution9x9.cl" }, - { "convolution_separable9x1_static", "convolution9x9.cl" }, { "copy_tensor", "copy_tensor.cl" }, - { "copy_plane", "channel_extract.cl" }, - { "copy_planes_3p", "channel_combine.cl" }, - { "copy_to_keypoint", "fast_corners.cl" }, { "crop_tensor", "crop_tensor.cl" }, { "deconvolution_reshape", "deconvolution_layer.cl" }, { "deconvolution_upsample", "deconvolution_layer.cl" }, @@ -275,8 +244,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "dequantization_layer", "dequantization_layer.cl" }, { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" }, { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" }, - { "derivative", "derivative.cl" }, - { "dilate", "dilate.cl" }, { "direct_convolution_nhwc", "direct_convolution.cl" }, { "direct_convolution1x1", "direct_convolution1x1.cl" }, { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" }, @@ -303,8 +270,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" }, { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" }, { "elementwise_unary", "elementwise_unary.cl" }, - { "erode", "erode.cl" }, - { "fast_corners", "fast_corners.cl" }, { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" }, { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" }, { "fft_radix_2_first_stage_axis_0", "fft.cl" }, @@ -334,12 +299,9 @@ const std::map CLKernelLibrary::_kernel_program_map = { "fft_scale_conj", "fft_scale.cl" }, { "fill_image_borders_constant", "fill_border.cl" }, { "fill_image_borders_replicate", "fill_border.cl" }, - { "finalize", "optical_flow_pyramid_lk.cl" }, { "floor_layer", "floor.cl" }, { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" }, { "gather", "gather.cl" }, - { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, - { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, { "gemm_ma_f16", "gemm.cl" }, { "gemm_ma_f32", "gemm.cl" }, { "gemm_mv", "gemv.cl" }, @@ -384,17 +346,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" }, { "generate_proposals_compute_all_anchors", "generate_proposals.cl" }, { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" }, - { "harris_score_3x3", "harris_corners.cl" }, - { "harris_score_5x5", "harris_corners.cl" }, - { "harris_score_7x7", "harris_corners.cl" }, - { "hist_border_kernel", "histogram.cl" }, - { "hist_border_kernel_fixed", "histogram.cl" }, - { "hist_local_kernel", "histogram.cl" }, - { "hist_local_kernel_fixed", "histogram.cl" }, - { "hog_block_normalization", "hog.cl" }, - { "hog_detector", "hog.cl" }, - { "hog_orientation_binning", "hog.cl" }, - { "hysteresis", "canny.cl" }, { "im2col1x1_stridex1_nchw", "im2col.cl" }, { "im2col3x3_nchw", "im2col.cl" }, { "im2col5x5_nchw", "im2col.cl" }, @@ -404,36 +355,14 @@ const std::map CLKernelLibrary::_kernel_program_map = { "im2col3x3_nhwc", "im2col.cl" }, { "im2col9x9_nhwc", "im2col.cl" }, { "im2col_generic_nhwc", "im2col.cl" }, - { "init_level", "optical_flow_pyramid_lk.cl" }, - { "init_level_max", "optical_flow_pyramid_lk.cl" }, - { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" }, { "instance_normalization", "instance_normalization.cl" }, - { "integral_horizontal", "integral_image.cl" }, - { "integral_vertical", "integral_image.cl" }, - { "IYUV_to_NV12_bt709", "color_convert.cl" }, - { "IYUV_to_RGB888_bt709", "color_convert.cl" }, - { "IYUV_to_RGBA8888_bt709", "color_convert.cl" }, - { "IYUV_to_YUV444_bt709", "color_convert.cl" }, { "l2_normalize_x", "l2_normalize.cl" }, { "l2_normalize_y", "l2_normalize.cl" }, { "l2_normalize_z", "l2_normalize.cl" }, - { "lktracker_stage0", "optical_flow_pyramid_lk.cl" }, - { "lktracker_stage1", "optical_flow_pyramid_lk.cl" }, - { "magnitude_phase", "magnitude_phase.cl" }, { "max_unpooling_layer_2", "unpooling_layer.cl" }, - { "mean_stddev_accumulate", "mean_stddev.cl" }, { "mean_stddev_normalization", "mean_stddev_normalization.cl" }, { "memset", "memset.cl" }, - { "minmax", "minmaxloc.cl" }, - { "minmax_border", "minmaxloc.cl" }, { "minmax_layer", "minmax_layer.cl" }, - { "minmaxloc", "minmaxloc.cl" }, - { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" }, - { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" }, - { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" }, - { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" }, - { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" }, - { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" }, { "non_max_suppression", "nonmax.cl" }, { "normalization_layer_cross_map", "normalization_layer.cl" }, { "normalization_layer_in_map_nchw", "normalization_layer.cl" }, @@ -442,14 +371,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" }, { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" }, { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" }, - { "NV12_to_IYUV_bt709", "color_convert.cl" }, - { "NV12_to_RGB888_bt709", "color_convert.cl" }, - { "NV12_to_RGBA8888_bt709", "color_convert.cl" }, - { "NV12_to_YUV444_bt709", "color_convert.cl" }, - { "NV21_to_IYUV_bt709", "color_convert.cl" }, - { "NV21_to_RGB888_bt709", "color_convert.cl" }, - { "NV21_to_RGBA8888_bt709", "color_convert.cl" }, - { "NV21_to_YUV444_bt709", "color_convert.cl" }, { "pad_layer_constant", "pad_layer.cl" }, { "pad_layer_symmetric_reflect", "pad_layer.cl" }, { "permute", "permute.cl" }, @@ -485,15 +406,6 @@ const std::map CLKernelLibrary::_kernel_program_map = { "reshape_layer", "reshape_layer.cl" }, { "reshape_to_columns", "convolution_layer.cl" }, { "reverse", "reverse.cl" }, - { "RGB888_to_IYUV_bt709", "color_convert.cl" }, - { "RGB888_to_NV12_bt709", "color_convert.cl" }, - { "RGB888_to_RGBA8888_bt709", "color_convert.cl" }, - { "RGB888_to_U8_bt709", "color_convert.cl" }, - { "RGB888_to_YUV444_bt709", "color_convert.cl" }, - { "RGBA8888_to_IYUV_bt709", "color_convert.cl" }, - { "RGBA8888_to_NV12_bt709", "color_convert.cl" }, - { "RGBA8888_to_RGB888_bt709", "color_convert.cl" }, - { "RGBA8888_to_YUV444_bt709", "color_convert.cl" }, { "roi_align_layer", "roi_align_layer.cl" }, { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" }, { "roi_pooling_layer", "roi_pooling_layer.cl" }, @@ -503,15 +415,9 @@ const std::map CLKernelLibrary::_kernel_program_map = { "scale_bilinear_nhwc", "scale.cl" }, { "scale_bilinear_quantized_nchw", "scale_quantized.cl" }, { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" }, - { "scharr3x3", "scharr_filter.cl" }, { "select_same_rank", "select.cl" }, { "select_different_rank_2", "select.cl" }, { "select_different_rank_n", "select.cl" }, - { "sobel3x3", "sobel_filter.cl" }, - { "sobel_separable5x1", "sobel_filter.cl" }, - { "sobel_separable1x5", "sobel_filter.cl" }, - { "sobel_separable7x1", "sobel_filter.cl" }, - { "sobel_separable1x7", "sobel_filter.cl" }, { "softmax_layer_norm", "softmax_layer.cl" }, { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" }, { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" }, @@ -526,23 +432,10 @@ const std::map CLKernelLibrary::_kernel_program_map = { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" }, { "stack_layer", "stack_layer.cl" }, { "strided_slice", "slice_ops.cl" }, - { "suppress_non_maximum", "canny.cl" }, - { "tablelookup_U8", "tablelookup.cl" }, - { "tablelookup_S16", "tablelookup.cl" }, - { "threshold_binary", "threshold.cl" }, - { "threshold_range", "threshold.cl" }, { "tile", "tile.cl" }, { "transpose", "transpose.cl" }, - { "UYVY422_to_IYUV_bt709", "color_convert.cl" }, - { "UYVY422_to_NV12_bt709", "color_convert.cl" }, - { "UYVY422_to_RGB888_bt709", "color_convert.cl" }, - { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" }, { "upsample_layer_nchw", "upsample_layer.cl" }, { "upsample_layer_nhwc", "upsample_layer.cl" }, - { "warp_affine_nearest_neighbour", "warp_affine.cl" }, - { "warp_affine_bilinear", "warp_affine.cl" }, - { "warp_perspective_nearest_neighbour", "warp_perspective.cl" }, - { "warp_perspective_bilinear", "warp_perspective.cl" }, { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" }, { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" }, { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" }, @@ -602,23 +495,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" }, { "yolo_layer_nchw", "yolo_layer.cl" }, { "yolo_layer_nhwc", "yolo_layer.cl" }, - { "YUYV422_to_IYUV_bt709", "color_convert.cl" }, - { "YUYV422_to_NV12_bt709", "color_convert.cl" }, - { "YUYV422_to_RGB888_bt709", "color_convert.cl" }, - { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" }, }; const std::map CLKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS - { - "absdiff.cl", -#include "./cl_kernels/absdiff.clembed" - }, - { - "accumulate.cl", -#include "./cl_kernels/accumulate.clembed" - }, { "activation_layer.cl", #include "./cl_kernels/activation_layer.clembed" @@ -646,18 +527,6 @@ const std::map CLKernelLibrary::_program_source_map = { "bounding_box_transform_quantized.cl", #include "./cl_kernels/bounding_box_transform_quantized.clembed" - }, - { - "canny.cl", -#include "./cl_kernels/canny.clembed" - }, - { - "channel_combine.cl", -#include "./cl_kernels/channel_combine.clembed" - }, - { - "channel_extract.cl", -#include "./cl_kernels/channel_extract.clembed" }, { "channel_shuffle.cl", @@ -674,38 +543,14 @@ const std::map CLKernelLibrary::_program_source_map = { "concatenate.cl", #include "./cl_kernels/concatenate.clembed" - }, - { - "color_convert.cl", -#include "./cl_kernels/color_convert.clembed" }, { "convert_fc_weights.cl", #include "./cl_kernels/convert_fc_weights.clembed" - }, - { - "convolution3x3.cl", -#include "./cl_kernels/convolution3x3.clembed" - }, - { - "convolution5x5.cl", -#include "./cl_kernels/convolution5x5.clembed" - }, - { - "convolution7x7.cl", -#include "./cl_kernels/convolution7x7.clembed" - }, - { - "convolution9x9.cl", -#include "./cl_kernels/convolution9x9.clembed" - }, + }, { "convolution_layer.cl", #include "./cl_kernels/convolution_layer.clembed" - }, - { - "convolution_rectangle.cl", -#include "./cl_kernels/convolution_rectangle.clembed" }, { "copy_tensor.cl", @@ -742,14 +587,6 @@ const std::map CLKernelLibrary::_program_source_map = { "dequantization_layer.cl", #include "./cl_kernels/dequantization_layer.clembed" - }, - { - "derivative.cl", -#include "./cl_kernels/derivative.clembed" - }, - { - "dilate.cl", -#include "./cl_kernels/dilate.clembed" }, { "direct_convolution1x1.cl", @@ -782,14 +619,6 @@ const std::map CLKernelLibrary::_program_source_map = { "elementwise_unary.cl", #include "./cl_kernels/elementwise_unary.clembed" - }, - { - "erode.cl", -#include "./cl_kernels/erode.clembed" - }, - { - "fast_corners.cl", -#include "./cl_kernels/fast_corners.clembed" }, { "fft.cl", @@ -814,10 +643,6 @@ const std::map CLKernelLibrary::_program_source_map = { "gather.cl", #include "./cl_kernels/gather.clembed" - }, - { - "gaussian_pyramid.cl", -#include "./cl_kernels/gaussian_pyramid.clembed" }, { "gemm.cl", @@ -842,10 +667,6 @@ const std::map CLKernelLibrary::_program_source_map = { "generate_proposals_quantized.cl", #include "./cl_kernels/generate_proposals_quantized.clembed" - }, - { - "harris_corners.cl", -#include "./cl_kernels/harris_corners.clembed" }, { "helpers.h", @@ -854,14 +675,6 @@ const std::map CLKernelLibrary::_program_source_map = { "helpers_asymm.h", #include "./cl_kernels/helpers_asymm.hembed" - }, - { - "histogram.cl", -#include "./cl_kernels/histogram.clembed" - }, - { - "hog.cl", -#include "./cl_kernels/hog.clembed" }, { "im2col.cl", @@ -870,22 +683,10 @@ const std::map CLKernelLibrary::_program_source_map = { "instance_normalization.cl", #include "./cl_kernels/instance_normalization.clembed" - }, - { - "integral_image.cl", -#include "./cl_kernels/integral_image.clembed" }, { "l2_normalize.cl", #include "./cl_kernels/l2_normalize.clembed" - }, - { - "magnitude_phase.cl", -#include "./cl_kernels/magnitude_phase.clembed" - }, - { - "mean_stddev.cl", -#include "./cl_kernels/mean_stddev.clembed" }, { "mean_stddev_normalization.cl", @@ -894,26 +695,10 @@ const std::map CLKernelLibrary::_program_source_map = { "memset.cl", #include "./cl_kernels/memset.clembed" - }, - { - "minmaxloc.cl", -#include "./cl_kernels/minmaxloc.clembed" }, { "minmax_layer.cl", #include "./cl_kernels/minmax_layer.clembed" - }, - { - "non_linear_filter3x3.cl", -#include "./cl_kernels/non_linear_filter3x3.clembed" - }, - { - "non_linear_filter5x5.cl", -#include "./cl_kernels/non_linear_filter5x5.clembed" - }, - { - "non_linear_filter_helpers.h", -#include "./cl_kernels/non_linear_filter_helpers.hembed" }, { "nonmax.cl", @@ -934,10 +719,6 @@ const std::map CLKernelLibrary::_program_source_map = { "batchnormalization_layer.cl", #include "./cl_kernels/batchnormalization_layer.clembed" - }, - { - "optical_flow_pyramid_lk.cl", -#include "./cl_kernels/optical_flow_pyramid_lk.clembed" }, { "pad_layer.cl", @@ -1018,18 +799,10 @@ const std::map CLKernelLibrary::_program_source_map = { "scale_quantized.cl", #include "./cl_kernels/scale_quantized.clembed" - }, - { - "scharr_filter.cl", -#include "./cl_kernels/scharr_filter.clembed" }, { "select.cl", #include "./cl_kernels/select.clembed" - }, - { - "sobel_filter.cl", -#include "./cl_kernels/sobel_filter.clembed" }, { "softmax_layer.cl", @@ -1054,14 +827,6 @@ const std::map CLKernelLibrary::_program_source_map = { "stack_layer.cl", #include "./cl_kernels/stack_layer.clembed" - }, - { - "tablelookup.cl", -#include "./cl_kernels/tablelookup.clembed" - }, - { - "threshold.cl", -#include "./cl_kernels/threshold.clembed" }, { "tile.cl", @@ -1078,18 +843,6 @@ const std::map CLKernelLibrary::_program_source_map = { "unpooling_layer.cl", #include "./cl_kernels/unpooling_layer.clembed" - }, - { - "warp_affine.cl", -#include "./cl_kernels/warp_affine.clembed" - }, - { - "warp_helpers.h", -#include "./cl_kernels/warp_helpers.hembed" - }, - { - "warp_perspective.cl", -#include "./cl_kernels/warp_perspective.clembed" }, { "winograd_filter_transform.cl", diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index 7383dce40f..22c9cd9c0c 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -25,23 +25,15 @@ #define ARM_COMPUTE_CLKERNELS_H /* Header regrouping all the CL kernels */ -#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" -#include "src/core/CL/kernels/CLAccumulateKernel.h" #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" #include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" #include "src/core/CL/kernels/CLBitwiseKernel.h" #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "src/core/CL/kernels/CLBox3x3Kernel.h" -#include "src/core/CL/kernels/CLCannyEdgeKernel.h" -#include "src/core/CL/kernels/CLChannelCombineKernel.h" -#include "src/core/CL/kernels/CLChannelExtractKernel.h" #include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "src/core/CL/kernels/CLCol2ImKernel.h" -#include "src/core/CL/kernels/CLColorConvertKernel.h" #include "src/core/CL/kernels/CLComparisonKernel.h" #include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" -#include "src/core/CL/kernels/CLConvolutionKernel.h" #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" @@ -51,14 +43,10 @@ #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" #include "src/core/CL/kernels/CLDequantizationLayerKernel.h" -#include "src/core/CL/kernels/CLDerivativeKernel.h" -#include "src/core/CL/kernels/CLDilateKernel.h" #include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h" -#include "src/core/CL/kernels/CLErodeKernel.h" #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "src/core/CL/kernels/CLFFTScaleKernel.h" -#include "src/core/CL/kernels/CLFastCornersKernel.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" @@ -77,28 +65,14 @@ #include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" #include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/CL/kernels/CLGatherKernel.h" -#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" -#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" -#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" -#include "src/core/CL/kernels/CLHOGDetectorKernel.h" -#include "src/core/CL/kernels/CLHarrisCornersKernel.h" -#include "src/core/CL/kernels/CLHistogramKernel.h" #include "src/core/CL/kernels/CLIm2ColKernel.h" #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "src/core/CL/kernels/CLIntegralImageKernel.h" #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLLKTrackerKernel.h" -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" -#include "src/core/CL/kernels/CLMeanStdDevKernel.h" #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" -#include "src/core/CL/kernels/CLMedian3x3Kernel.h" #include "src/core/CL/kernels/CLMinMaxLayerKernel.h" -#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" -#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" -#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" #include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" @@ -114,22 +88,14 @@ #include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "src/core/CL/kernels/CLReverseKernel.h" #include "src/core/CL/kernels/CLScaleKernel.h" -#include "src/core/CL/kernels/CLScharr3x3Kernel.h" #include "src/core/CL/kernels/CLSelectKernel.h" -#include "src/core/CL/kernels/CLSobel3x3Kernel.h" -#include "src/core/CL/kernels/CLSobel5x5Kernel.h" -#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include "src/core/CL/kernels/CLStackLayerKernel.h" #include "src/core/CL/kernels/CLStridedSliceKernel.h" -#include "src/core/CL/kernels/CLTableLookupKernel.h" -#include "src/core/CL/kernels/CLThresholdKernel.h" #include "src/core/CL/kernels/CLTileKernel.h" #include "src/core/CL/kernels/CLTransposeKernel.h" -#include "src/core/CL/kernels/CLWarpAffineKernel.h" -#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" #include "src/core/CL/kernels/CLWinogradInputTransformKernel.h" diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl deleted file mode 100644 index a09caf5dc5..0000000000 --- a/src/core/CL/cl_kernels/absdiff.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculate the absolute difference of two input images. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * - * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8, S16 - * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] in1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] in1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16 - * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes) - * @param[in] in2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes) - * @param[in] in2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void absdiff( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) -{ - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - VEC_DATA_TYPE(DATA_TYPE_OUT, 16) - in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); - - vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); -} diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl deleted file mode 100644 index 9e37830f1b..0000000000 --- a/src/core/CL/cl_kernels/accumulate.cl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function accumulates an input image into output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void accumulate( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu)) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load data - uchar16 in_data = vload16(0, input.ptr); - short16 accu_data = vload16(0, (__global short *)accu.ptr); - - // Perform accumulation - short16 res = add_sat(convert_short16(in_data), accu_data); - - // Store result - vstore16(res, 0, (__global short *)accu.ptr); -} - -/** This function accumulates a weighted value from an input image to an output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] alpha The float scalar value with a value in the range of 0 to 1 - */ -__kernel void accumulate_weighted( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu), - const float alpha) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load data - const float16 in_data = convert_float16(vload16(0, input.ptr)); - const float16 accu_data = convert_float16(vload16(0, accu.ptr)); - - // Calculate weighted accumulation - const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data); - - // Store result - vstore16(res, 0, accu.ptr); -} - -/** This function accumulates a squared value from an input image to an output image. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] shift The U32 scalar value with a value in the range of 0 to 15 - */ -__kernel void accumulate_squared( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(accu), - const uint shift) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Image accu = CONVERT_TO_IMAGE_STRUCT(accu); - - // Load data - ushort16 in_data = convert_ushort16(vload16(0, input.ptr)); - uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr)); - - // Calculate squared accumulation - short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift)); - - // Store result - vstore16(res, 0, (__global short *)accu.ptr); -} diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl deleted file mode 100644 index bcff8438db..0000000000 --- a/src/core/CL/cl_kernels/canny.cl +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculate the magnitude and phase from horizontal and vertical result of sobel result. - * - * @note The calculation of gradient uses level 1 normalisation. - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short - * - * @param[in] src1_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32 - * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8 - * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output - */ -__kernel void combine_gradients_L1( - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(src2), - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle)) -{ - // Construct images - Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); - Image src2 = CONVERT_TO_IMAGE_STRUCT(src2); - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - - // Load sobel horizontal and vertical values - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr); - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr); - - /* Calculate the gradient, using level 1 normalisation method */ - VEC_DATA_TYPE(DATA_TYPE_OUT, 4) - m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4)); - - /* Calculate the angle */ - float4 p = 180.0f * atan2pi(convert_float4(v), convert_float4(h)); - - /* Remap angle to range [0, 256) */ - p = select(p, p + 180.0f, p < 0.0f); - - /* Store results */ - vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr); - vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr); -} - -/** Calculate the gradient and angle from horizontal and vertical result of sobel result. - * - * @note The calculation of gradient uses level 2 normalisation - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short - * - * @param[in] src1_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32 - * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32 - * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8 - * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output - */ -__kernel void combine_gradients_L2( - IMAGE_DECLARATION(src1), - IMAGE_DECLARATION(src2), - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle)) -{ - // Construct images - Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); - Image src2 = CONVERT_TO_IMAGE_STRUCT(src2); - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - - // Load sobel horizontal and vertical values - float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr)); - float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr)); - - /* Calculate the gradient, using level 2 normalisation method */ - float4 m = sqrt(h * h + v * v); - - /* Calculate the angle */ - float4 p = 180.0f * atan2pi(v, h); - - /* Remap angle to range [0, 256) */ - p = select(p, p + 180.0f, p < 0.0f); - - /* Store results */ - vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr); - vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr); -} - -#define EDGE 255 -#define NO_EDGE 0 - -/** Array that holds the relative coordinates offset for the neighbouring pixels. - */ -__constant short4 neighbours_coords[] = -{ - { -1, 0, 1, 0 }, // 0 - { -1, -1, 1, 1 }, // 45 - { 0, -1, 0, 1 }, // 90 - { 1, -1, -1, 1 }, // 135 -}; - -/** Perform non maximum suppression. - * - * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short - * - * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32 - * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output - * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8 - * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] angle_offset_first_element_in_bytes TThe offset of the first element of the output - * @param[out] non_max_ptr Pointer to the non maximum suppressed output. Supported data types: U16, U32 - * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output - * @param[in] lower_thr The low threshold - */ -__kernel void suppress_non_maximum( - IMAGE_DECLARATION(grad), - IMAGE_DECLARATION(angle), - IMAGE_DECLARATION(non_max), - uint lower_thr) -{ - // Construct images - Image grad = CONVERT_TO_IMAGE_STRUCT(grad); - Image angle = CONVERT_TO_IMAGE_STRUCT(angle); - Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max); - - // Index - const int x = get_global_id(0); - const int y = get_global_id(1); - - // Get gradient and angle - DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr); - uchar an = *((__global uchar *)angle.ptr); - - // Early return if not greater than lower threshold - if(gradient <= lower_thr) - { - return; - } - - // Divide the whole round into 4 directions - DATA_TYPE_OUT q_an; - - if(an < 22.5f || an >= 157.5f) - { - q_an = 0; - } - else if(an < 67.5f) - { - q_an = 1; - } - else if(an < 112.5f) - { - q_an = 2; - } - else - { - q_an = 3; - } - - // Find the two pixels in the perpendicular direction - short2 x_p = neighbours_coords[q_an].s02; - short2 y_p = neighbours_coords[q_an].s13; - DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x)); - DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y)); - - if((gradient > g1) && (gradient > g2)) - { - __global uchar *non_max_addr = non_max_ptr + non_max_offset_first_element_in_bytes + x * non_max_stride_x + y * non_max_stride_y; - *((global DATA_TYPE_OUT *)non_max_addr) = gradient; - } -} - -#define hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side -#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation - -/** Check whether pixel is valid - * - * Skip the pixel if the early_test fails. - * Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full - * - * @param[in] early_test Boolean condition based on the minv check and visited buffer check - * @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary - * @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary - * @param[in] x_cur X-coordinate of current central pixel - * @param[in] y_cur Y-coordinate of current central pixel - */ -#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \ - { \ - if(!early_test) \ - { \ - /* Number of elements in the local stack 1, points to next available entry */ \ - c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \ - \ - if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \ - goto pop_stack; \ - \ - /* The pixel that has already been recorded is ignored */ \ - if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \ - { \ - l1_ptr[c] = (short2)(x_pos, y_pos); \ - *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \ - } \ - } \ - } - -/** Perform hysteresis. - * - * @attention The input data_type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short - * - * @param[in] src_ptr Pointer to the input image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] out_ptr Pointer to the output image. Supported data types: U8 - * @param[in] out_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32 - * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32 - * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32 - * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output - * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8 - * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output - * @param[in] low_thr The lower threshold - * @param[in] up_thr The upper threshold - * @param[in] width The width of the image. - * @param[in] height The height of the image - */ -kernel void hysteresis( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(out), - IMAGE_DECLARATION(visited), - IMAGE_DECLARATION(recorded), - IMAGE_DECLARATION(l1_stack), - IMAGE_DECLARATION(l1_stack_counter), - uint low_thr, - uint up_thr, - int width, - int height) -{ - // Create images - Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); - Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out); - Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited); - Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded); - Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack); - Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter); - - // Index - int x = get_global_id(0); - int y = get_global_id(1); - - // Load value - DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y)); - - // If the pixel has already been marked as NO_EDGE, store that value in the output and return - if(val == NO_EDGE) - { - *offset(&out, x, y) = NO_EDGE; - return; - } - - // Return if it is a MAYBE pixel. Such pixels will become edges if near a strong edge - if(val <= up_thr) - { - return; - } - - // Init local stack 2 - short2 stack_L2[hysteresis_local_stack_L2] = { 0 }; - int L2_counter = 0; - - // Perform recursive hysteresis - while(true) - { - // Get L1 stack pointer - __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x); - - // If the pixel has already been visited, proceed with the items in the stack instead - if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0) - { - goto pop_stack; - } - - // Set strong edge - *offset(&out, x, y) = EDGE; - - // If it is the top of stack l2, we don't need check the surrounding pixels - if(L2_counter > (hysteresis_local_stack_L2 - 1)) - { - goto pop_stack2; - } - - // Points to the start of the local stack; - char c; - - VEC_DATA_TYPE(DATA_TYPE_IN, 4) - x_tmp; - uint4 v_tmp; - - // Get direction pixel indices - int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2); - - // Check 8 pixels around for weak edges where low_thr < val <= up_thr - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, N)); - check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW - check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE - - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, y)); - check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E - - x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S)); - v_tmp = vload4(0, (__global uint *)offset(&visited, W, S)); - check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW - check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S - check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE - -#undef check_pixel - -pop_stack: - c = *((__global char *)offset(&l1_stack_counter, x, y)); - - if(c >= 1) - { - *((__global char *)offset(&l1_stack_counter, x, y)) -= 1; - int2 l_c = convert_int2(l1_ptr[c - 1]); - - // Push the current position into level 2 stack - stack_L2[L2_counter].x = x; - stack_L2[L2_counter].y = y; - - x = l_c.x; - y = l_c.y; - - L2_counter++; - - continue; - } - - if(L2_counter > 0) - { - goto pop_stack2; - } - else - { - return; - } - -pop_stack2: - L2_counter--; - x = stack_L2[L2_counter].x; - y = stack_L2[L2_counter].y; - }; -} diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl deleted file mode 100644 index 550d52e9ea..0000000000 --- a/src/core/CL/cl_kernels/channel_combine.cl +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function combines three planes to a single RGB image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGB - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_RGB888( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar16 data1 = vload16(0, plane1.ptr); - uchar16 data2 = vload16(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, - data0.s1, data1.s1, data2.s1, - data0.s2, data1.s2, data2.s2, - data0.s3, data1.s3, data2.s3, - data0.s4, data1.s4, data2.s4, - data0.s5); - vstore16(out0, 0, dst.ptr); - - uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6, - data1.s6, data2.s6, data0.s7, - data1.s7, data2.s7, data0.s8, - data1.s8, data2.s8, data0.s9, - data1.s9, data2.s9, data0.sA, - data1.sA); - vstore16(out1, 0, dst.ptr + 16); - - uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB, - data2.sB, data0.sC, data1.sC, - data2.sC, data0.sD, data1.sD, - data2.sD, data0.sE, data1.sE, - data2.sE, data0.sF, data1.sF, - data2.sF); - vstore16(out2, 0, dst.ptr + 32); -} - -/** This function combines three planes to a single RGBA image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] plane3_ptr Pointer to the fourth plane. Supported Format: U8 - * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes) - * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes) - * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane - * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGBA - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_RGBA8888( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(plane3), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar16 data1 = vload16(0, plane1.ptr); - uchar16 data2 = vload16(0, plane2.ptr); - uchar16 data3 = vload16(0, plane3.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0, - data0.s1, data1.s1, data2.s1, data3.s1, - data0.s2, data1.s2, data2.s2, data3.s2, - data0.s3, data1.s3, data2.s3, data3.s3); - vstore16(out0, 0, dst.ptr); - - uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4, - data0.s5, data1.s5, data2.s5, data3.s5, - data0.s6, data1.s6, data2.s6, data3.s6, - data0.s7, data1.s7, data2.s7, data3.s7); - vstore16(out1, 0, dst.ptr + 16); - - uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8, - data0.s9, data1.s9, data2.s9, data3.s9, - data0.sA, data1.sA, data2.sA, data3.sA, - data0.sB, data1.sB, data2.sB, data3.sB); - vstore16(out2, 0, dst.ptr + 32); - - uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC, - data0.sD, data1.sD, data2.sD, data3.sD, - data0.sE, data1.sE, data2.sE, data3.sE, - data0.sF, data1.sF, data2.sF, data3.sF); - vstore16(out3, 0, dst.ptr + 48); -} - -/** This function combines three planes to a single YUYV image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. Supported Format: YUYV - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_YUYV422( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar8 data1 = vload8(0, plane1.ptr); - uchar8 data2 = vload8(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0, - data0.s2, data1.s1, data0.s3, data2.s1, - data0.s4, data1.s2, data0.s5, data2.s2, - data0.s6, data1.s3, data0.s7, data2.s3); - vstore16(out0, 0, dst.ptr); - uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4, - data0.sA, data1.s5, data0.sB, data2.s5, - data0.sC, data1.s6, data0.sD, data2.s6, - data0.sE, data1.s7, data0.sF, data2.s7); - vstore16(out1, 0, dst.ptr + 16); -} - -/** This function combines three planes to a single UYUV image. - * - * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_ptr Pointer to the destination image. Supported Format: UYUV - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_combine_UYVY422( - IMAGE_DECLARATION(plane0), - IMAGE_DECLARATION(plane1), - IMAGE_DECLARATION(plane2), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0); - Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1); - Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data0 = vload16(0, plane0.ptr); - uchar8 data1 = vload8(0, plane1.ptr); - uchar8 data2 = vload8(0, plane2.ptr); - - uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1, - data1.s1, data0.s2, data2.s1, data0.s3, - data1.s2, data0.s4, data2.s2, data0.s5, - data1.s3, data0.s6, data2.s3, data0.s7); - vstore16(out0, 0, dst.ptr); - uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9, - data1.s5, data0.sA, data2.s5, data0.sB, - data1.s6, data0.sC, data2.s6, data0.sD, - data1.s7, data0.sE, data2.s7, data0.sF); - vstore16(out1, 0, dst.ptr + 16); -} - -/** This function combines three planes to a single NV12/NV21 image. - * - * @note NV12 or NV21 has to be specified through preprocessor macro. eg. -DNV12 performs NV12 channel combine. - * - * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8 - * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image - * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88 - * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image - * @param[in] height Sub-sampled height - */ -__kernel void channel_combine_NV( - IMAGE_DECLARATION(src_plane0), - IMAGE_DECLARATION(src_plane1), - IMAGE_DECLARATION(src_plane2), - IMAGE_DECLARATION(dst_plane0), - IMAGE_DECLARATION(dst_plane1), - uint height) -{ - // Get pixels pointer - Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); - Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); - Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); - Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); - Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); - - // Copy plane data - vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); - vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); - - // Create UV place - uchar8 data1 = vload8(0, src_plane1.ptr); - uchar8 data2 = vload8(0, src_plane2.ptr); - -#ifdef NV12 - vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); -#elif defined(NV21) - vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr); -#endif /* NV12 or NV21 */ -} - -/** This function combines three planes to a single YUV444 or IYUV image. - * - * @note YUV444 or IYUV has to be specified through preprocessor macro. eg. -DIYUV performs IYUV channel combine. - * - * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8 - * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes) - * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes) - * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane - * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8 - * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes) - * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes) - * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane - * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8 - * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes) - * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes) - * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane - * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8 - * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image - * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8 - * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image - * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. Supported Format: U8 - * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes) - * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes) - * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image - * @param[in] height Sub-sampled height - */ -__kernel void copy_planes_3p( - IMAGE_DECLARATION(src_plane0), - IMAGE_DECLARATION(src_plane1), - IMAGE_DECLARATION(src_plane2), - IMAGE_DECLARATION(dst_plane0), - IMAGE_DECLARATION(dst_plane1), - IMAGE_DECLARATION(dst_plane2), - uint height) -{ - // Get pixels pointer - Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0); - Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1); - Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2); - Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0); - Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1); - Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2); - - // Copy plane data - vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr); -#ifdef YUV444 - vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr); - vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr); -#elif defined(IYUV) - vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height)); - vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr); - vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr); -#endif /* YUV444 or IYUV */ -} diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl deleted file mode 100644 index b64f24814e..0000000000 --- a/src/core/CL/cl_kernels/channel_extract.cl +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function extracts a given channel from an RGB image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: RGB - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_RGB888( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - uchar8 data2 = vload8(0, src.ptr + 16); - -#ifdef CHANNEL_R - vstore4(data.s0369, 0, dst.ptr); - vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4); -#elif defined(CHANNEL_G) - vstore4(data.s147A, 0, dst.ptr); - vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4); -#elif defined(CHANNEL_B) - vstore4(data.s258B, 0, dst.ptr); - vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4); -#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B */ -} - -/** This function extracts a given channel from an RGBA image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_RGBA8888( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - uchar16 data2 = vload16(0, src.ptr + 16); - -#ifdef CHANNEL_R - vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr); -#elif defined(CHANNEL_G) - vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr); -#elif defined(CHANNEL_B) - vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr); -#elif defined(CHANNEL_A) - vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr); -#endif /* CHANNEL_R or CHANNEL_G or CHANNEL_B or CHANNEL_A */ -} - -/** This function extracts a given channel from an YUYV image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_YUYV422( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - -#ifdef CHANNEL_Y - vstore8(data.s02468ACE, 0, dst.ptr); -#elif defined(CHANNEL_U) - vstore4(data.s159D, 0, dst.ptr); -#elif defined(CHANNEL_V) - vstore4(data.s37BF, 0, dst.ptr); -#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */ -} - -/** This function extracts a given channel from an UYUV image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: UYUV - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_UYVY422( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - -#ifdef CHANNEL_Y - vstore8(data.s13579BDF, 0, dst.ptr); -#elif defined(CHANNEL_U) - vstore4(data.s048C, 0, dst.ptr); -#elif defined(CHANNEL_V) - vstore4(data.s26AE, 0, dst.ptr); -#endif /* CHANNEL_Y or CHANNEL_U or CHANNEL_V */ -} - -/** This function extracts a given channel from an NV12 image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * @warning Only channels UV can be extracted using this kernel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88) - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_NV12( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - -#ifdef CHANNEL_U - vstore8(data.s02468ACE, 0, dst.ptr); -#elif defined(CHANNEL_V) - vstore8(data.s13579BDF, 0, dst.ptr); -#endif /* CHANNEL_U or CHANNEL_V */ -} - -/** This function extracts a given channel from an NV21 image. - * - * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel. - * @warning Only channels UV can be extracted using this kernel. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88) - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void channel_extract_NV21( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 data = vload16(0, src.ptr); - -#ifdef CHANNEL_U - vstore8(data.s13579BDF, 0, dst.ptr); -#elif defined(CHANNEL_V) - vstore8(data.s02468ACE, 0, dst.ptr); -#endif /* CHANNEL_U or CHANNEL_V */ -} - -/** This function extracts a given plane from an multi-planar image. - * - * @param[in] src_ptr Pointer to the source image. Supported Format: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void copy_plane( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Copy plane data - vstore8(vload8(0, src.ptr), 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl deleted file mode 100644 index cbebc88668..0000000000 --- a/src/core/CL/cl_kernels/color_convert.cl +++ /dev/null @@ -1,1911 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Convert an RGB888 image to RGBX8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void RGB888_to_RGBA8888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 16 pixels every time - uchar16 rgb_0 = vload16(0, in.ptr); - uchar16 rgb_1 = vload16(0, in.ptr + 16); - uchar16 rgb_2 = vload16(0, in.ptr + 32); - - uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255); - uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255); - uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255); - uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255); - - vstore16(rgba_0, 0, out.ptr); - vstore16(rgba_1, 0, out.ptr + 16); - vstore16(rgba_2, 0, out.ptr + 32); - vstore16(rgba_3, 0, out.ptr + 48); -} - -/** Convert an RGB888 image to U8 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: RGB888 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void RGB888_to_U8_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 16 pixels every time - const uchar16 rgb_0 = vload16(0, in.ptr); - const uchar16 rgb_1 = vload16(0, in.ptr + 16); - const uchar16 rgb_2 = vload16(0, in.ptr + 32); - - //Resequence values from a sequence of 16 RGB values to sequence of 16 R, 16 G, 16 B values - const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd); - const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se); - const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf); - - const float16 rgb2u8_red_coef_bt709 = 0.2126f; - const float16 rgb2u8_green_coef_bt709 = 0.7152f; - const float16 rgb2u8_blue_coef_bt709 = 0.0722f; - - //Computation of 16 greyscale values in float - const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b); - - //Convert it to 16 grayscale uchar values - const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0); - - vstore16(greyscale_u8_0, 0, out.ptr); -} - -/** Convert an RGB888 image to RGBX8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void RGBA8888_to_RGB888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - // handle 16 pixels every time - uchar16 rgba_0 = vload16(0, in.ptr); - uchar16 rgba_1 = vload16(0, in.ptr + 16); - uchar16 rgba_2 = vload16(0, in.ptr + 32); - uchar16 rgba_3 = vload16(0, in.ptr + 48); - - uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124); - uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689); - uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde); - - vstore16(rgb_0, 0, out.ptr); - vstore16(rgb_1, 0, out.ptr + 16); - vstore16(rgb_2, 0, out.ptr + 32); -} - -/** Convert a UYVY422 image to RGB888 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void UYVY422_to_RGB888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 8 pixels every time - uchar16 uyvy = vload16(0, in.ptr); - - uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128); - char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128); - - float8 red_coef_bt709 = (float8)(1.5748f); - float8 green_coef_bt709 = (float8)(-0.1873f); - float8 green_coef2_bt709 = (float8)(-0.4681f); - float8 blue_coef_bt709 = (float8)(1.8556f); - float8 lumav = convert_float8(luma); - - float8 f_r = red_coef_bt709 * convert_float8(cr); - float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr); - float8 f_b = blue_coef_bt709 * convert_float8(cb); - - f_r += lumav; - f_g += lumav; - f_b += lumav; - - uchar8 r_0 = convert_uchar8_sat_rtz(f_r); - uchar8 g_0 = convert_uchar8_sat_rtz(f_g); - uchar8 b_0 = convert_uchar8_sat_rtz(f_b); - - uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2, - r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5); - uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7); - - vstore16(rgb_0, 0, out.ptr); - vstore8(rgb_1, 0, out.ptr + 16); -} - -/** Convert a UYVY422 image to RGBX8888 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void UYVY422_to_RGBA8888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 8 pixels every time - uchar16 uyvy = vload16(0, in.ptr); - - uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128); - char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128); - - float8 red_coef_bt709 = (float8)(1.5748f); - float8 green_coef_bt709 = (float8)(-0.1873f); - float8 green_coef2_bt709 = (float8)(-0.4681f); - float8 blue_coef_bt709 = (float8)(1.8556f); - float8 lumav = convert_float8(luma); - - float8 f_r = red_coef_bt709 * convert_float8(cr); - float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr); - float8 f_b = blue_coef_bt709 * convert_float8(cb); - - f_r += lumav; - f_g += lumav; - f_b += lumav; - - uchar8 r_0 = convert_uchar8_sat_rtz(f_r); - uchar8 g_0 = convert_uchar8_sat_rtz(f_g); - uchar8 b_0 = convert_uchar8_sat_rtz(f_b); - - uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255, - r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255, - r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255); - - vstore16(rgba_0, 0, out.ptr); - vstore16(rgba_1, 0, out.ptr + 16); -} - -/** Convert a YUYV422 image to RGB888 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void YUYV422_to_RGB888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 8 pixels every time - uchar16 uyvy = vload16(0, in.ptr); - - uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se); - char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128); - char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128); - - float8 red_coef_bt709 = (float8)(1.5748f); - float8 green_coef_bt709 = (float8)(-0.1873f); - float8 green_coef2_bt709 = (float8)(-0.4681f); - float8 blue_coef_bt709 = (float8)(1.8556f); - float8 lumav = convert_float8(luma); - - float8 f_r = red_coef_bt709 * convert_float8(cr); - float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr); - float8 f_b = blue_coef_bt709 * convert_float8(cb); - - f_r += lumav; - f_g += lumav; - f_b += lumav; - - uchar8 r_0 = convert_uchar8_sat_rtz(f_r); - uchar8 g_0 = convert_uchar8_sat_rtz(f_g); - uchar8 b_0 = convert_uchar8_sat_rtz(f_b); - - uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2, - r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5); - uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7); - - vstore16(rgb_0, 0, out.ptr); - vstore8(rgb_1, 0, out.ptr + 16); -} - -/** Convert a YUYV422 image to RGBX8888 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void YUYV422_to_RGBA8888_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - // handle 8 pixels every time - uchar16 uyvy = vload16(0, in.ptr); - - uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se); - char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128); - char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128); - - float8 red_coef_bt709 = (float8)(1.5748f); - float8 green_coef_bt709 = (float8)(-0.1873f); - float8 green_coef2_bt709 = (float8)(-0.4681f); - float8 blue_coef_bt709 = (float8)(1.8556f); - float8 lumav = convert_float8(luma); - - float8 f_r = red_coef_bt709 * convert_float8(cr); - float8 f_g = green_coef_bt709 * convert_float8(cb) + green_coef2_bt709 * convert_float8(cr); - float8 f_b = blue_coef_bt709 * convert_float8(cb); - - f_r += lumav; - f_g += lumav; - f_b += lumav; - - uchar8 r_0 = convert_uchar8_sat_rtz(f_r); - uchar8 g_0 = convert_uchar8_sat_rtz(f_g); - uchar8 b_0 = convert_uchar8_sat_rtz(f_b); - - uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255, - r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255, - r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255); - - vstore16(rgba_0, 0, out.ptr); - vstore16(rgba_1, 0, out.ptr + 16); -} - -/** Convert a RGB image to NV12 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel - * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8 - * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes) - * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel - * - */ -__kernel void RGB888_to_NV12_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(luma), - IMAGE_DECLARATION(uv)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma); - Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv); - - // handle 4 pixels every time, two lines, each line for 2 pixels - // Read 2 pixel of the first line - uchar8 rgb_0 = vload8(0, in.ptr); - uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3); - uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4); - uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5); - - float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); - float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); - float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); - - short2 i_y = convert_short2_rtz(f_y); - short2 i_u = convert_short2_rtz(f_u) + (short2)(128); - short2 i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_0, 0, out_y.ptr); - - uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - - // Read 2 pixel of the second line - uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y); - uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3); - uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4); - uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5); - - f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); - f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); - f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); - - i_y = convert_short2_rtz(f_y); - i_u = convert_short2_rtz(f_u) + (short2)(128); - i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_1, 0, out_y.ptr + luma_stride_y); - - uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), - ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); - - vstore2(cbcr, 0, out_uv.ptr); -} - -/* - R'= Y' + 0.0000*U + 1.5748*V - G'= Y' - 0.1873*U - 0.4681*V - B'= Y' + 1.8556*U + 0.0000*V -*/ - -/** Convert an NV12 image to RGB888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgb_output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV12_to_RGB888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgb_output)) -{ - Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_luma.ptr); - uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore4(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); - vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); -} - -/** Convert a RGB image to YUV444 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] rgb_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void RGB888_to_YUV444_bt709( - IMAGE_DECLARATION(rgb_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - // handle 4 pixels every time - Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // Read 4 pixel - uchar16 rgb_0 = vload16(0, in_rgb.ptr); - uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9); - uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa); - uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb); - - float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0); - float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0); - float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0); - - short4 i_y = convert_short4_rtz(f_y); - short4 i_u = convert_short4_rtz(f_u) + (short4)(128); - short4 i_v = convert_short4_rtz(f_v) + (short4)(128); - - uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255)))); - vstore4(luma_0, 0, out_y.ptr); - - uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255)))); - uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255)))); - vstore4(cb_0, 0, out_u.ptr); - vstore4(cr_0, 0, out_v.ptr); -} - -/** Convert a RGB image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 2), height ] - * No offset. - * - * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] rgb_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void RGB888_to_IYUV_bt709( - IMAGE_DECLARATION(rgb_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - // handle 4 pixels every time, two lines, each line for 2 pixels - Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // Read 2 pixel of the first line - uchar8 rgb_0 = vload8(0, in_rgb.ptr); - uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3); - uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4); - uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5); - - float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); - float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); - float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); - - short2 i_y = convert_short2_rtz(f_y); - short2 i_u = convert_short2_rtz(f_u) + (short2)(128); - short2 i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_0, 0, out_y.ptr); - - uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - - // Read 2 pixel of the second line - uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y); - uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3); - uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4); - uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5); - - f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); - f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); - f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); - - i_y = convert_short2_rtz(f_y); - i_u = convert_short2_rtz(f_u) + (short2)(128); - i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); - - uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), - ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); - *out_u.ptr = cbcr.x; - *out_v.ptr = cbcr.y; -} - -/** Convert a RGBA image to YUV444 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] rgba_input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] rgba_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void RGBA8888_to_YUV444_bt709( - IMAGE_DECLARATION(rgba_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - // handle 4 pixels every time - Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // Read 4 pixel - uchar16 rgb_0 = vload16(0, in_rgba.ptr); - uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc); - uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd); - uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se); - - float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0); - float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0); - float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0); - - short4 i_y = convert_short4(f_y); - short4 i_u = convert_short4(f_u) + (short4)(128); - short4 i_v = convert_short4(f_v) + (short4)(128); - - uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255)))); - vstore4(luma_0, 0, out_y.ptr); - - uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255)))); - uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255)))); - vstore4(cb_0, 0, out_u.ptr); - vstore4(cr_0, 0, out_v.ptr); -} - -/** Convert a RGBA image to NV12 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 2), height ] - * No offset. - * - * @param[in] input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel - * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8 - * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes) - * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes) - * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel - * - */ -__kernel void RGBA8888_to_NV12_bt709( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(uv_output)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output); - - // Read 2 pixel of the first line - uchar8 rgb_0 = vload8(0, in.ptr); - uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4); - uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5); - uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6); - - float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); - float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); - float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); - - short2 i_y = convert_short2_rtz(f_y); - short2 i_u = convert_short2_rtz(f_u) + (short2)(128); - short2 i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_0, 0, out_y.ptr); - - uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - - // Read 2 pixel of the second line - uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y); - uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4); - uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5); - uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6); - - f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); - f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); - f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); - - i_y = convert_short2_rtz(f_y); - i_u = convert_short2_rtz(f_u) + (short2)(128); - i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); - - uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), - ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); - vstore2(cbcr, 0, out_uv.ptr); -} - -/** Convert a RGBA image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 2), height ] - * No offset. - * - * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void RGBA8888_to_IYUV_bt709( - IMAGE_DECLARATION(rgba_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - // handle 4 pixels every time, two lines, each line for 2 pixels - Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // Read 2 pixel of the first line - uchar8 rgb_0 = vload8(0, in_rgb.ptr); - uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4); - uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5); - uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6); - - float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0); - float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0); - float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0); - - short2 i_y = convert_short2_rtz(f_y); - short2 i_u = convert_short2_rtz(f_u) + (short2)(128); - short2 i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_0, 0, out_y.ptr); - - uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - - // Read 2 pixel of the second line - uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y); - uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4); - uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5); - uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6); - - f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1); - f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1); - f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1); - - i_y = convert_short2_rtz(f_y); - i_u = convert_short2_rtz(f_u) + (short2)(128); - i_v = convert_short2_rtz(f_v) + (short2)(128); - - uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255)))); - vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y); - - uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255)))); - uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255)))); - uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4), - ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4)); - *out_u.ptr = cbcr.x; - *out_v.ptr = cbcr.y; -} - -/** Convert an NV12 image to RGB8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV12_to_RGBA8888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgb_output)) -{ - Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); - - uchar4 luma_0 = vload4(0, in_luma.ptr); - uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore8(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); - vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); -} - -/** Convert an NV12 image to IYUV - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV12_to_IYUV_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar8 cb = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se); - uchar8 cr = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore8(cb, 0, out_u.ptr); - vstore8(cr, 0, out_v.ptr); -} - -/** Convert an NV12 image to YUV444 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV12_to_YUV444_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8, - cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se); - uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9, - cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cb, 0, out_u.ptr); - vstore16(cb, 0, out_u.ptr + u_output_stride_y); - vstore16(cr, 0, out_v.ptr); - vstore16(cr, 0, out_v.ptr + v_output_stride_y); -} - -/** Convert an NV21 image to RGB888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgb_output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV21_to_RGB888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgb_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_y.ptr); - uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore4(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); - vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); -} - -/** Convert an NV12 image to RGB8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void NV21_to_RGBA8888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(rgba_output)) -{ - Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_luma.ptr); - uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y); - uchar4 cbcr = vload4(0, in_uv.ptr); - char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128); - char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore8(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y); - vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8); -} - -/** Convert an NV21 image to YUV444 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV21_to_YUV444_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8, - cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se); - uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9, - cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cb, 0, out_u.ptr); - vstore16(cb, 0, out_u.ptr + u_output_stride_y); - vstore16(cr, 0, out_v.ptr); - vstore16(cr, 0, out_v.ptr + v_output_stride_y); -} - -/** Convert an NV21 image to IYUV - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8 - * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes) - * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - */ -__kernel void NV21_to_IYUV_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(uv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar16 cbcr = vload16(0, in_uv.ptr); - uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se); - uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore8(cb, 0, out_u.ptr); - vstore8(cr, 0, out_v.ptr); -} - -/** Convert a UYVY image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void UYVY422_to_IYUV_bt709( - IMAGE_DECLARATION(uyvy_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 16 pixels every time, each line 8 pixels - uchar16 uyvy = vload16(0, in_uyvy.ptr); - uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc); - ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se); - vstore8(luma, 0, out_y.ptr); - - uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y); - luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf); - ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc); - ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se); - vstore8(luma, 0, out_y.ptr + luma_output_stride_y); - - uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2)); - uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2)); - vstore4(cb, 0, out_u.ptr); - vstore4(cr, 0, out_v.ptr); -} - -/** Convert a YUYV image to IYUV using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void YUYV422_to_IYUV_bt709( - IMAGE_DECLARATION(yuyv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 16 pixels every time, each line 8 pixels - uchar16 yuyv = vload16(0, in_yuyv.ptr); - uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd); - ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf); - vstore8(luma, 0, out_y.ptr); - - yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y); - luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd); - ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf); - vstore8(luma, 0, out_y.ptr + luma_output_stride_y); - - uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2)); - uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2)); - vstore4(cb, 0, out_u.ptr); - vstore4(cr, 0, out_v.ptr); -} - -/** Convert an IYUV image to RGB888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8 - * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) - * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel - * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8 - * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes) - * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes) - * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel - * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void IYUV_to_RGB888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(u_input), - IMAGE_DECLARATION(v_input), - IMAGE_DECLARATION(rgb_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input); - Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_y.ptr); - uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y); - uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr)); - char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128); - char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore4(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2); - rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3); - vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y); - vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8); -} - -/** Convert an IYUV image to RGB8888 - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8 - * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) - * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel - * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8 - * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes) - * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes) - * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel - * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8 - * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void IYUV_to_RGBA8888_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(u_input), - IMAGE_DECLARATION(v_input), - IMAGE_DECLARATION(rgba_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input); - Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input); - Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output); - - // handle 8 pixels every time, two lines, each line for 4 pixels - uchar4 luma_0 = vload4(0, in_y.ptr); - uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y); - uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr)); - char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128); - char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128); - - float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr); - float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr); - float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr); - - float4 f_r = convert_float4(luma_0) + temp0; - float4 f_g = convert_float4(luma_0) + temp1; - float4 f_b = convert_float4(luma_0) + temp2; - - uchar4 r_0 = convert_uchar4_sat_rtz(f_r); - uchar4 g_0 = convert_uchar4_sat_rtz(f_g); - uchar4 b_0 = convert_uchar4_sat_rtz(f_b); - - uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr); - vstore8(rgb_1, 0, out_rgb.ptr + 8); - - f_r = convert_float4(luma_1) + temp0; - f_g = convert_float4(luma_1) + temp1; - f_b = convert_float4(luma_1) + temp2; - - r_0 = convert_uchar4_sat_rtz(f_r); - g_0 = convert_uchar4_sat_rtz(f_g); - b_0 = convert_uchar4_sat_rtz(f_b); - - rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255); - rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255); - vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y); - vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8); -} - -/** Convert an IYUV image to YUV444 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8 - * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) - * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel - * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8 - * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes) - * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes) - * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8 - * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes) - * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel - * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8 - * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes) - * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes) - * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel - * - */ -__kernel void IYUV_to_YUV444_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(u_input), - IMAGE_DECLARATION(v_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(u_output), - IMAGE_DECLARATION(v_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input); - Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output); - Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar8 cb_src = vload8(0, in_u.ptr); - uchar8 cr_src = vload8(0, in_v.ptr); - uchar16 cb = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3, - cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7); - uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3, - cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cb, 0, out_u.ptr); - vstore16(cb, 0, out_u.ptr + u_output_stride_y); - vstore16(cr, 0, out_v.ptr); - vstore16(cr, 0, out_v.ptr + v_output_stride_y); -} - -/** Convert an IYUV image to NV12 - * - * Global Workgroup Size [ DIV_CEIL(width, 16), height ] - * No offset. - * - * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8 - * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes) - * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes) - * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8 - * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes) - * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel - * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8 - * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes) - * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes) - * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8 - * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes) - * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_output_stride_y Stride of the destination image U channel in Y dimension (in bytes) - * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel - * - */ -__kernel void IYUV_to_NV12_bt709( - IMAGE_DECLARATION(luma_input), - IMAGE_DECLARATION(u_input), - IMAGE_DECLARATION(v_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(uv_output)) -{ - Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input); - Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input); - Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output); - - // handle 32 pixels every time, two lines, each line for 16 pixels - uchar16 luma_0 = vload16(0, in_y.ptr); - uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y); - uchar8 cb = vload8(0, in_u.ptr); - uchar8 cr = vload8(0, in_v.ptr); - uchar16 cbcr = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6, - cr.s6, cb.s7, cr.s7); - - vstore16(luma_0, 0, out_y.ptr); - vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y); - vstore16(cbcr, 0, out_uv.ptr); -} - -/** Convert a YUYV image to NV12 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * No offset. - * - * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8 - * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel - * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8 - * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes) - * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes) - * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel - * - */ -__kernel void YUYV422_to_NV12_bt709( - IMAGE_DECLARATION(yuyv_input), - IMAGE_DECLARATION(luma_output), - IMAGE_DECLARATION(uv_output)) -{ - Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output); - Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output); - - // handle 16 pixels every time, each line 8 pixels - uchar16 yuyv = vload16(0, in_yuyv.ptr); - ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf); - uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - vstore8(luma, 0, out_y.ptr); - - yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y); - ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf); - luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se); - vstore8(luma, 0, out_y.ptr + luma_output_stride_y); - - uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2)); - vstore8(cbcr, 0, out_uv.ptr); -} - -/** Convert a UYVY image to NV12 using BT709 color space - * - * Global Workgroup Size [ DIV_CEIL(width, 4), height ] - * No offset. - * - * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8 - * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8 - * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes) - * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel - * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8 - * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes) - * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] uv_stride_y Stride of the destination image luma channel in Y dimension (in bytes) - * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel - * - */ -__kernel void UYVY422_to_NV12_bt709( - IMAGE_DECLARATION(input_uyvy), - IMAGE_DECLARATION(luma), - IMAGE_DECLARATION(uv)) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input_uyvy); - Image out_y = CONVERT_TO_IMAGE_STRUCT(luma); - Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv); - - // handle 16 pixels every time, each line 8 pixels - const uchar16 uyvy_t = vload16(0, in.ptr); - vstore8(uyvy_t.s13579bdf, 0, out_y.ptr); - - const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y); - vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y); - - const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se); - const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se); - const uchar8 cbcr = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2)); - vstore8(cbcr, 0, out_uv.ptr); -} diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl deleted file mode 100644 index 7bca567b11..0000000000 --- a/src/core/CL/cl_kernels/convolution3x3.cl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel. - * @param[in] left_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right_coeff Weight of the right pixel - * - * @return a short8 containing 8 convoluted values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel, - const short left_coeff, - const short middle_coeff, - const short right_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - VEC_DATA_TYPE(DATA_TYPE, 8) - left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff; -} - -/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result. - * - * Convolution matrix layout: - * - * [ mat0, mat1, mat2 ]\n - * [ mat3, mat4, mat5 ]\n - * [ mat6, mat7, mat8 ]\n - * - * @param[in] src A pointer to source Image structure - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - * @return a short8 containing 8 convoluted and scaled values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3( - Image *src, - const short mat0, const short mat1, const short mat2, - const short mat3, const short mat4, const short mat5, - const short mat6, const short mat7, const short mat8, uint scale) -{ - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - // Row 0 - pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2); - // Row - pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5); - // Row 2 - pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8); - - // Divide by the scale - return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale; -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n - * e.g. -DMAT0=1 -DMAT2=2, ...-DMAT8=8, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution3x3_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution3x3(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE); - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl deleted file mode 100644 index 9995ebfa90..0000000000 --- a/src/core/CL/cl_kernels/convolution5x5.cl +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(DATA_TYPE, 8) -convolution1x5( - __global const uchar *left_pixel, - const short left1_coeff, - const short left2_coeff, - const short middle_coeff, - const short right1_coeff, - const short right2_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - - VEC_DATA_TYPE(DATA_TYPE, 8) - left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff - + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff; -} - -/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the most down pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(COMPUTE_TYPE, 8) -convolution5x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff) -{ - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - val; - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; - - return out; -} - -/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result. - * - * Convolution matrix layout:\n - * [ mat0, mat1, mat2, mat3 , mat4 ]\n - * [ mat5, mat6, mat7, mat8, mat9 ]\n - * [ mat10, mat11, mat12, mat13, mat14 ]\n - * [ mat15, mat16, mat17, mat18, mat19 ]\n - * [ mat20, mat21, mat22, mat23, mat24 ] - * - * @param[in] src A pointer to source Image structure. - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - * @return a short8 containing 8 convoluted and scaled values. - */ -short8 convolution5x5( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, - const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4); - pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9); - pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14); - pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19); - pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image(Support U16, S16, S32). - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT2=2, -DMAT3=3, -DMAT4=4, -DDATA_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x5_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable5x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9); - - // Divide by the scale - pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT24, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution5x5_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution5x5(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE); - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl deleted file mode 100644 index 50fb3d7f35..0000000000 --- a/src/core/CL/cl_kernels/convolution7x7.cl +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the second left pixel - * @param[in] left3_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the second right pixel - * @param[in] right3_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(DATA_TYPE, 8) -convolution1x7( - __global const uchar *left_pixel, - const short left1_coeff, - const short left2_coeff, - const short left3_coeff, - const short middle_coeff, - const short right1_coeff, - const short right2_coeff, - const short right3_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - - VEC_DATA_TYPE(DATA_TYPE, 8) - left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, - 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff; -} - -/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the second up pixel - * @param[in] up3_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the second down pixel - * @param[in] down3_coeff Weight of the third down pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(COMPUTE_TYPE, 8) -convolution7x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short up3_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff, - const short down3_coeff) -{ - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - val; - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff; - - return out; -} - -/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result. - * - * Convolution matrix layout:\n - * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6 ]\n - * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n - * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n - * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n - * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n - * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n - * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ] - * - * @param[in] src A pointer to source Image structure. - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] mat25 Coefficient from the convolution matrix - * @param[in] mat26 Coefficient from the convolution matrix - * @param[in] mat27 Coefficient from the convolution matrix - * @param[in] mat28 Coefficient from the convolution matrix - * @param[in] mat29 Coefficient from the convolution matrix - * @param[in] mat30 Coefficient from the convolution matrix - * @param[in] mat31 Coefficient from the convolution matrix - * @param[in] mat32 Coefficient from the convolution matrix - * @param[in] mat33 Coefficient from the convolution matrix - * @param[in] mat34 Coefficient from the convolution matrix - * @param[in] mat35 Coefficient from the convolution matrix - * @param[in] mat36 Coefficient from the convolution matrix - * @param[in] mat37 Coefficient from the convolution matrix - * @param[in] mat38 Coefficient from the convolution matrix - * @param[in] mat39 Coefficient from the convolution matrix - * @param[in] mat40 Coefficient from the convolution matrix - * @param[in] mat41 Coefficient from the convolution matrix - * @param[in] mat42 Coefficient from the convolution matrix - * @param[in] mat43 Coefficient from the convolution matrix - * @param[in] mat44 Coefficient from the convolution matrix - * @param[in] mat45 Coefficient from the convolution matrix - * @param[in] mat46 Coefficient from the convolution matrix - * @param[in] mat47 Coefficient from the convolution matrix - * @param[in] mat48 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - */ -short8 convolution7x7( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, - const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, - const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, - const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, - const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, - const short mat45, const short mat46, const short mat47, const short mat48, uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6); - pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13); - pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20); - pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27); - pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34); - pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41); - pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image. - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x7_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 7x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT24=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable7x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13); - - // Divide by the scale - pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel U8 image including the borders. - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT48, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution7x7_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution7x7(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, - MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, - MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE); - - // Clamp results to [ 0, 255 ] and store them in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl deleted file mode 100644 index 7e77c61fea..0000000000 --- a/src/core/CL/cl_kernels/convolution9x9.cl +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2016-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] left_pixel Pointer to the left pixel - * @param[in] left1_coeff Weight of the most left pixel - * @param[in] left2_coeff Weight of the second left pixel - * @param[in] left3_coeff Weight of the third left pixel - * @param[in] left4_coeff Weight of the left pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] right1_coeff Weight of the right pixel - * @param[in] right2_coeff Weight of the second right pixel - * @param[in] right3_coeff Weight of the third right pixel - * @param[in] right4_coeff Weight of the most right pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(DATA_TYPE, 8) -convolution1x9( - __global const uchar *left_pixel, - const short left1_coeff, - const short left2_coeff, - const short left3_coeff, - const short left4_coeff, - const short middle_coeff, - const short right1_coeff, - const short right2_coeff, - const short right3_coeff, - const short right4_coeff) -{ - uchar16 temp = vload16(0, left_pixel); - - VEC_DATA_TYPE(DATA_TYPE, 8) - left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8)); - VEC_DATA_TYPE(DATA_TYPE, 8) - right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8)); - - return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE, - 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, - 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff; -} - -/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the second up pixel - * @param[in] up3_coeff Weight of the third up pixel - * @param[in] up4_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the second down pixel - * @param[in] down3_coeff Weight of the third down pixel - * @param[in] down4_coeff Weight of the most down pixel - * - * @return a short8 containing 8 convoluted values. - */ -VEC_DATA_TYPE(COMPUTE_TYPE, 8) -convolution9x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short up3_coeff, - const short up4_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff, - const short down3_coeff, - const short down4_coeff) -{ - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - val; - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff; - - val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8)); - out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff; - - return out; -} - -/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result. - * - * Convolution matrix layout:\n - * [ mat0, mat1, mat2, mat3 , mat4, mat5, mat6, mat7, mat8 ]\n - * [ mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n - * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n - * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n - * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n - * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n - * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ] - * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ] - * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ] - * - * @param[in] src A pointer to source Image structure. - * @param[in] mat0 Coefficient from the convolution matrix - * @param[in] mat1 Coefficient from the convolution matrix - * @param[in] mat2 Coefficient from the convolution matrix - * @param[in] mat3 Coefficient from the convolution matrix - * @param[in] mat4 Coefficient from the convolution matrix - * @param[in] mat5 Coefficient from the convolution matrix - * @param[in] mat6 Coefficient from the convolution matrix - * @param[in] mat7 Coefficient from the convolution matrix - * @param[in] mat8 Coefficient from the convolution matrix - * @param[in] mat9 Coefficient from the convolution matrix - * @param[in] mat10 Coefficient from the convolution matrix - * @param[in] mat11 Coefficient from the convolution matrix - * @param[in] mat12 Coefficient from the convolution matrix - * @param[in] mat13 Coefficient from the convolution matrix - * @param[in] mat14 Coefficient from the convolution matrix - * @param[in] mat15 Coefficient from the convolution matrix - * @param[in] mat16 Coefficient from the convolution matrix - * @param[in] mat17 Coefficient from the convolution matrix - * @param[in] mat18 Coefficient from the convolution matrix - * @param[in] mat19 Coefficient from the convolution matrix - * @param[in] mat20 Coefficient from the convolution matrix - * @param[in] mat21 Coefficient from the convolution matrix - * @param[in] mat22 Coefficient from the convolution matrix - * @param[in] mat23 Coefficient from the convolution matrix - * @param[in] mat24 Coefficient from the convolution matrix - * @param[in] mat25 Coefficient from the convolution matrix - * @param[in] mat26 Coefficient from the convolution matrix - * @param[in] mat27 Coefficient from the convolution matrix - * @param[in] mat28 Coefficient from the convolution matrix - * @param[in] mat29 Coefficient from the convolution matrix - * @param[in] mat30 Coefficient from the convolution matrix - * @param[in] mat31 Coefficient from the convolution matrix - * @param[in] mat32 Coefficient from the convolution matrix - * @param[in] mat33 Coefficient from the convolution matrix - * @param[in] mat34 Coefficient from the convolution matrix - * @param[in] mat35 Coefficient from the convolution matrix - * @param[in] mat36 Coefficient from the convolution matrix - * @param[in] mat37 Coefficient from the convolution matrix - * @param[in] mat38 Coefficient from the convolution matrix - * @param[in] mat39 Coefficient from the convolution matrix - * @param[in] mat40 Coefficient from the convolution matrix - * @param[in] mat41 Coefficient from the convolution matrix - * @param[in] mat42 Coefficient from the convolution matrix - * @param[in] mat43 Coefficient from the convolution matrix - * @param[in] mat44 Coefficient from the convolution matrix - * @param[in] mat45 Coefficient from the convolution matrix - * @param[in] mat46 Coefficient from the convolution matrix - * @param[in] mat47 Coefficient from the convolution matrix - * @param[in] mat48 Coefficient from the convolution matrix - * @param[in] mat49 Coefficient from the convolution matrix - * @param[in] mat50 Coefficient from the convolution matrix - * @param[in] mat51 Coefficient from the convolution matrix - * @param[in] mat52 Coefficient from the convolution matrix - * @param[in] mat53 Coefficient from the convolution matrix - * @param[in] mat54 Coefficient from the convolution matrix - * @param[in] mat55 Coefficient from the convolution matrix - * @param[in] mat56 Coefficient from the convolution matrix - * @param[in] mat57 Coefficient from the convolution matrix - * @param[in] mat58 Coefficient from the convolution matrix - * @param[in] mat59 Coefficient from the convolution matrix - * @param[in] mat60 Coefficient from the convolution matrix - * @param[in] mat61 Coefficient from the convolution matrix - * @param[in] mat62 Coefficient from the convolution matrix - * @param[in] mat63 Coefficient from the convolution matrix - * @param[in] mat64 Coefficient from the convolution matrix - * @param[in] mat65 Coefficient from the convolution matrix - * @param[in] mat66 Coefficient from the convolution matrix - * @param[in] mat67 Coefficient from the convolution matrix - * @param[in] mat68 Coefficient from the convolution matrix - * @param[in] mat69 Coefficient from the convolution matrix - * @param[in] mat70 Coefficient from the convolution matrix - * @param[in] mat71 Coefficient from the convolution matrix - * @param[in] mat72 Coefficient from the convolution matrix - * @param[in] mat73 Coefficient from the convolution matrix - * @param[in] mat74 Coefficient from the convolution matrix - * @param[in] mat75 Coefficient from the convolution matrix - * @param[in] mat76 Coefficient from the convolution matrix - * @param[in] mat77 Coefficient from the convolution matrix - * @param[in] mat78 Coefficient from the convolution matrix - * @param[in] mat79 Coefficient from the convolution matrix - * @param[in] mat80 Coefficient from the convolution matrix - * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0) - * - */ -short8 convolution9x9( - Image *src, - const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, - const short mat5, const short mat6, const short mat7, const short mat8, const short mat9, - const short mat10, const short mat11, const short mat12, const short mat13, const short mat14, - const short mat15, const short mat16, const short mat17, const short mat18, const short mat19, - const short mat20, const short mat21, const short mat22, const short mat23, const short mat24, - const short mat25, const short mat26, const short mat27, const short mat28, const short mat29, - const short mat30, const short mat31, const short mat32, const short mat33, const short mat34, - const short mat35, const short mat36, const short mat37, const short mat38, const short mat39, - const short mat40, const short mat41, const short mat42, const short mat43, const short mat44, - const short mat45, const short mat46, const short mat47, const short mat48, const short mat49, - const short mat50, const short mat51, const short mat52, const short mat53, const short mat54, - const short mat55, const short mat56, const short mat57, const short mat58, const short mat59, - const short mat60, const short mat61, const short mat62, const short mat63, const short mat64, - const short mat65, const short mat66, const short mat67, const short mat68, const short mat69, - const short mat70, const short mat71, const short mat72, const short mat73, const short mat74, - const short mat75, const short mat76, const short mat77, const short mat78, const short mat79, - const short mat80, uint scale) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels; - - pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8); - pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17); - pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26); - pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35); - pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44); - pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53); - pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62); - pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71); - pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80); - - if(scale > 0) - { - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale; - } - - return convert_short8_sat(pixels); -} - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single temporary channel image. - * - * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n - * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DCOMPUTE_TYPE=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable1x9_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8); - - // Store result in dst - vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply a 9x1 static convolution matrix to a single channel U8 input image and output a single channel image. - * - * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_separable9x1_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Output pixels - VEC_DATA_TYPE(COMPUTE_TYPE, 8) - pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17); - - // Divide by the scale - pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE; - - // Store result in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution9x9_static( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short8 pixels = convolution9x9(&src, - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, - MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, - MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37, - MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49, - MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, - MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73, - MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE); - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr); -} - -#endif // DYNAMIC_MATRIX_CONVOLUTION diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl deleted file mode 100644 index 925a698628..0000000000 --- a/src/core/CL/cl_kernels/convolution_rectangle.cl +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "convolution3x3.cl" -#include "convolution5x5.cl" -#include "convolution7x7.cl" -#include "convolution9x9.cl" -#include "helpers.h" - -#define MAT_INDEX(i) MAT##i - -#ifndef DATA_TYPE -#define DATA_TYPE short -#endif /* DATA_TYPE */ - -#ifndef COMPUTE_TYPE -#define COMPUTE_TYPE int -#endif /* COMPUTE_TYPE */ - -#ifndef DATA_TYPE_OUT -#define DATA_TYPE_OUT uchar -#endif /* DATA_TYPE_OUT */ - -#ifndef DYNAMIC_MATRIX_CONVOLUTION - -/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders - * - * @attention The matrix coefficients(MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE, DATA_TYPE_OUT need to be passed at compile time:\n - * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convolution_rectangle( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - short matrix_coeff[81] = - { - MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, - MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17, - MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26, - MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, - MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, - MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53, - MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62, - MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, - MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80 - }; - - VEC_DATA_TYPE(DATA_TYPE, 8) - pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0; - - for(int i = 0; i < MATRIX_HEIGHT; i++) - { -#if MATRIX_WIDTH == 3 - pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3], - matrix_coeff[2 + i * 3]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 5 - pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5], - matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 7 - pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7], - matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7], - matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]); -#endif /* MATRIX_WIDTH */ - -#if MATRIX_WIDTH == 9 - pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9], - matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9], - matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]); -#endif /* MATRIX_WIDTH */ - } - - pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE; - - // Store the result as is in dst - vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr)); -} - -#endif /* not DYNAMIC_MATRIX_CONVOLUTION */ diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl deleted file mode 100644 index dddbb4d615..0000000000 --- a/src/core/CL/cl_kernels/derivative.cl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This OpenCL kernel that computes the first-order derivative. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void derivative( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - -#ifdef GRAD_X - short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0))); - short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0))); - vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1))); - short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1))); - vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl deleted file mode 100644 index 14362c1f31..0000000000 --- a/src/core/CL/cl_kernels/dilate.cl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function dilates an input image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void dilate( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789)); - - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl deleted file mode 100644 index 810c5fc51a..0000000000 --- a/src/core/CL/cl_kernels/erode.cl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function erodes an input image image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void erode( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789)); - - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl deleted file mode 100644 index 89c144ab5e..0000000000 --- a/src/core/CL/cl_kernels/fast_corners.cl +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. - * - * . . F 0 1 . . . - * . E . . . 2 . . - * D . . . . . 3 . - * C . . P . . 4 . - * B . . . . . 5 . - * . A . . . 6 . . - * . . 9 8 7 . . . - */ -constant int offsets_s[16][2] = -{ - { 0, -3 }, // 0 - { 1, -3 }, // 1 - { 2, -2 }, // 2 - { 3, -1 }, // 3 - { 3, 0 }, // 4 - { 3, 1 }, // 5 - { 2, 2 }, // 6 - { 1, 3 }, // 7 - { 0, 3 }, // 8 - { -1, 3 }, // 9 - { -2, 2 }, // A - { -3, 1 }, // B - { -3, 0 }, // C - { -3, -1 }, // D - { -2, -2 }, // E - { -1, -3 }, // F -}; - -/** Load a pixel and set the mask values. - * - * @param[in] ptr The pointer to the starting address of source image - * @param[in] a Index to indicate the position in the Bresenham circle - * @param[in] stride Stride of source image in x dimension - * @param[in] dark The left end of the threshold range - * @param[in] bright The right end of the threshold range - * @param[out] dark_mask The bit-set mask records dark pixels. Its bit is set as 1 if the corresponding pixel is dark - * @param[out] bright_mask The bit-set mask records bright pixels. Its bit is set as 1 if the corresponding pixel is bright - * - */ -#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \ - { \ - unsigned char pixel; \ - pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \ - dark_mask |= (pixel < dark) << a; \ - bright_mask |= (pixel > bright) << a; \ - } - -/** Checks if a pixel is a corner. Pixel is considerred as a corner if the 9 continuous pixels in the Bresenham circle are bright or dark. - * - * @param[in] bright_mask The mask recording postions of bright pixels - * @param[in] dark_mask The mask recording postions of dark pixels - * @param[out] isCorner Indicate whether candidate pixel is corner - */ -#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \ - { \ - for(int i = 0; i < 16; i++) \ - { \ - isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \ - isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \ - if(isCorner) \ - { \ - break; \ - } \ - bright_mask >>= 1; \ - dark_mask >>= 1; \ - } \ - } - -/* Calculate pixel's strength */ -uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold) -{ - short a = threshold; - short b = 255; - while(b - a > 1) - { - uchar c = convert_uchar_sat((a + b) / 2); - unsigned int bright_mask = 0; - unsigned int dark_mask = 0; - - unsigned char p_bright = add_sat(candidate_pixel, c); - unsigned char p_dark = sub_sat(candidate_pixel, c); - - bool isCorner = 0; - - for(uint i = 0; i < 16; i++) - { - LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask) - } - - bright_mask |= (bright_mask << 16); - dark_mask |= (dark_mask << 16); - CHECK_CORNER(bright_mask, dark_mask, isCorner); - - if(isCorner) - { - a = convert_short(c); - } - else - { - b = convert_short(c); - } - } - return a; -} - -/** Fast corners implementation. Calculates and returns the strength of each pixel. - * - * The algorithm loops through the 16 pixels in the Bresenham circle and set low 16 bit of masks if corresponding pixel is bright - * or dark. It then copy the low 16 bit to the high 16 bit of the masks. Right shift the bit to check whether the 9 continuous bits - * from the LSB are set. - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] output_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] output_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] threshold_value Threshold value. - * - */ -__kernel void fast_corners( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output), - float threshold_value) -{ - Image in = CONVERT_TO_IMAGE_STRUCT(input); - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - const unsigned char threshold = (uchar)threshold_value; - - unsigned int bright_mask = 0; - unsigned int dark_mask = 0; - - unsigned char isCorner = 0; - - unsigned char p = *in.ptr; - unsigned char p_bright = add_sat(p, threshold); - unsigned char p_dark = sub_sat(p, threshold); - - LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - - if(((bright_mask | dark_mask) & 0x1111) == 0) - { - *out.ptr = 0; - return; - } - - LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask) - - bright_mask |= (bright_mask << 16); - dark_mask |= (dark_mask << 16); - - CHECK_CORNER(bright_mask, dark_mask, isCorner) - - if(!isCorner) - { - *out.ptr = 0; - return; - } - -#ifdef USE_MAXSUPPRESSION - *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold); -#else /* USE_MAXSUPPRESSION */ - *out.ptr = 1; -#endif /* USE_MAXSUPPRESSION */ -} - -/** Copy result to Keypoint buffer and count number of corners - * - * @param[in] input_ptr Pointer to the image with calculated strenghs. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] max_num_points The maximum number of keypoints the array can hold - * @param[out] offset The number of skipped pixels in x dimension - * @param[out] num_of_points Number of points found - * @param[out] out The keypoints found - * - */ -__kernel void copy_to_keypoint( - IMAGE_DECLARATION(input), - uint max_num_points, - uint offset, - __global uint *num_of_points, - __global Keypoint *out) -{ -#ifndef UPDATE_NUMBER - if(*num_of_points >= max_num_points) - { - return; - } -#endif /* UPDATE_NUMBER */ - - Image in = CONVERT_TO_IMAGE_STRUCT(input); - - uchar value = *in.ptr; - - if(value > 0) - { - int id = atomic_inc(num_of_points); - if(id < max_num_points) - { - out[id].strength = value; - out[id].x = get_global_id(0) + offset; - out[id].y = get_global_id(1) + offset; - out[id].tracking_status = 1; - out[id].scale = 0.f; - out[id].orientation = 0.f; - out[id].error = 0.f; - } - } -} diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl deleted file mode 100644 index ae2c31a848..0000000000 --- a/src/core/CL/cl_kernels/gaussian_pyramid.cl +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction - * - * @note Each thread computes 8 pixels - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void gaussian1x5_sub_x( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values for the convolution (20 bytes needed) - uchar16 temp0 = vload16(0, src.ptr); - uchar4 temp1 = vload4(0, src.ptr + 16); - - // Convert to USHORT8 - ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE)); - ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF)); - ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0)); - ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1)); - ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02)); - - // Compute convolution along the X direction - ushort8 pixels = l2_data + r2_data; - pixels += l1_data * (ushort8)4; - pixels += m_data * (ushort8)6; - pixels += r1_data * (ushort8)4; - - // Store result - vstore8(pixels, 0, (__global ushort *)dst.ptr); -} - -/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction - * - * @note Each thread computes 8 pixels - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void gaussian5x1_sub_y( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0)); - ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1)); - ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2)); - ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3)); - ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4)); - - // Compute convolution along the Y direction - ushort8 pixels = u2_data + d2_data; - pixels += u1_data * (ushort8)4; - pixels += m_data * (ushort8)6; - pixels += d1_data * (ushort8)4; - - // Scale result - pixels >>= (ushort8)8; - - // Store result - vstore8(convert_uchar8_sat(pixels), 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl deleted file mode 100644 index 3e3c9fd23c..0000000000 --- a/src/core/CL/cl_kernels/harris_corners.cl +++ /dev/null @@ -1,376 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Function running harris score on 3x3 block size - * - * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. - * e.g. -DDATA_TYPE=short. - * - * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32 - * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32 - * @param[in] src_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32 - * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation - * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores - * @param[in] pow4_normalization_factor Normalization factor to apply harris score - */ -__kernel void harris_score_3x3( - IMAGE_DECLARATION(src_gx), - IMAGE_DECLARATION(src_gy), - IMAGE_DECLARATION(vc), - float sensitivity, - float strength_thresh, - float pow4_normalization_factor) -{ - Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx); - Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy); - Image vc = CONVERT_TO_IMAGE_STRUCT(vc); - - /* Gx^2, Gy^2 and Gx*Gy */ - float4 gx2 = (float4)0.0f; - float4 gy2 = (float4)0.0f; - float4 gxgy = (float4)0.0f; - - /* Row0 */ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1)); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1)); - - float4 l_gx = convert_float4(temp_gx.s0123); - float4 m_gx = convert_float4(temp_gx.s1234); - float4 r_gx = convert_float4(temp_gx.s2345); - - float4 l_gy = convert_float4(temp_gy.s0123); - float4 m_gy = convert_float4(temp_gy.s1234); - float4 r_gy = convert_float4(temp_gy.s2345); - - gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx); - gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy); - gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy); - - /* Row1 */ - temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0)); - temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0)); - - l_gx = convert_float4(temp_gx.s0123); - m_gx = convert_float4(temp_gx.s1234); - r_gx = convert_float4(temp_gx.s2345); - - l_gy = convert_float4(temp_gy.s0123); - m_gy = convert_float4(temp_gy.s1234); - r_gy = convert_float4(temp_gy.s2345); - - gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx); - gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy); - gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy); - - /* Row2 */ - temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1)); - temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1)); - - l_gx = convert_float4(temp_gx.s0123); - m_gx = convert_float4(temp_gx.s1234); - r_gx = convert_float4(temp_gx.s2345); - - l_gy = convert_float4(temp_gy.s0123); - m_gy = convert_float4(temp_gy.s1234); - r_gy = convert_float4(temp_gy.s2345); - - gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx); - gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy); - gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy); - - /* Compute trace and determinant */ - float4 trace = gx2 + gy2; - float4 det = gx2 * gy2 - (gxgy * gxgy); - - /* Compute harris score */ - float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor; - - mc = select(0.0f, mc, mc > (float4)strength_thresh); - - vstore4(mc, 0, (__global float *)vc.ptr); -} - -/** Function for calculating harris score 1x5. - * - * @param[in] src_gx Pointer to gx gradient image. - * @param[in] src_gy Pointer to gy gradient image. - * @param[in] row Relative row. - */ -inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row) -{ - float4 gx2 = 0.0f; - float4 gy2 = 0.0f; - float4 gxgy = 0.0f; - - /* Row */ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row)); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row)); - - float4 gx = convert_float4(temp_gx.s0123); - float4 gy = convert_float4(temp_gy.s0123); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s1234); - gy = convert_float4(temp_gy.s1234); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s2345); - gy = convert_float4(temp_gy.s2345); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s3456); - gy = convert_float4(temp_gy.s3456); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx.s4567); - gy = convert_float4(temp_gy.s4567); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - return (float16)(gx2, gy2, gxgy, (float4)0); -} - -/** Function running harris score on 5x5 block size - * - * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. - * e.g. -DDATA_TYPE=short. - * - * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32 - * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32 - * @param[in] src_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32 - * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation - * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores - * @param[in] pow4_normalization_factor Normalization factor to apply harris score - */ -__kernel void harris_score_5x5( - IMAGE_DECLARATION(src_gx), - IMAGE_DECLARATION(src_gy), - IMAGE_DECLARATION(vc), - float sensitivity, - float strength_thresh, - float pow4_normalization_factor) -{ - Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx); - Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy); - Image vc = CONVERT_TO_IMAGE_STRUCT(vc); - - /* Gx^2, Gy^2 and Gx*Gy */ - float16 res = (float16)0.0f; - - /* Compute row */ - for(int i = -2; i < 3; i++) - { - res += harris_score_1x5(&src_gx, &src_gy, i); - } - - float4 gx2 = res.s0123; - float4 gy2 = res.s4567; - float4 gxgy = res.s89AB; - - /* Compute trace and determinant */ - float4 trace = gx2 + gy2; - float4 det = gx2 * gy2 - (gxgy * gxgy); - - /* Compute harris score */ - float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor; - - mc = select(0.0f, mc, mc > (float4)strength_thresh); - - vstore4(mc, 0, (__global float *)vc.ptr); -} - -/** Function for calculating harris score 1x7. - * - * @param[in] src_gx Pointer to gx gradient image. - * @param[in] src_gy Pointer to gy gradient image. - * @param[in] row Relative row. - */ -inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row) -{ - float4 gx2 = 0.0f; - float4 gy2 = 0.0f; - float4 gxgy = 0.0f; - - /* Row */ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row)); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row)); - VEC_DATA_TYPE(DATA_TYPE, 2) - temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row)); - VEC_DATA_TYPE(DATA_TYPE, 2) - temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row)); - - float4 gx = convert_float4(temp_gx0.s0123); - float4 gy = convert_float4(temp_gy0.s0123); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx0.s1234); - gy = convert_float4(temp_gy0.s1234); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx0.s2345); - gy = convert_float4(temp_gy0.s2345); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx0.s3456); - gy = convert_float4(temp_gy0.s3456); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4(temp_gx0.s4567); - gy = convert_float4(temp_gy0.s4567); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0)); - gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0)); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01)); - gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01)); - gx2 += (gx * gx); - gy2 += (gy * gy); - gxgy += (gx * gy); - - return (float16)(gx2, gy2, gxgy, (float4)0); -} - -/** Function running harris score on 7x7 block size - * - * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int. - * e.g. -DDATA_TYPE=short. - * - * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32 - * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32 - * @param[in] src_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32 - * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation - * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores - * @param[in] pow4_normalization_factor Normalization factor to apply harris score - */ -__kernel void harris_score_7x7( - IMAGE_DECLARATION(src_gx), - IMAGE_DECLARATION(src_gy), - IMAGE_DECLARATION(vc), - float sensitivity, - float strength_thresh, - float pow4_normalization_factor) -{ - Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx); - Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy); - Image vc = CONVERT_TO_IMAGE_STRUCT(vc); - - /* Gx^2, Gy^2 and Gx*Gy */ - float16 res = (float16)0.0f; - - /* Compute row */ - for(int i = -3; i < 4; i++) - { - res += harris_score_1x7(&src_gx, &src_gy, i); - } - - float4 gx2 = res.s0123; - float4 gy2 = res.s4567; - float4 gxgy = res.s89AB; - - /* Compute trace and determinant */ - float4 trace = gx2 + gy2; - float4 det = gx2 * gy2 - (gxgy * gxgy); - - /* Compute harris score */ - float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor; - - mc = select(0.0f, mc, mc > (float4)strength_thresh); - - vstore4(mc, 0, (__global float *)vc.ptr); -} diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl deleted file mode 100644 index a93cb4d1c7..0000000000 --- a/src/core/CL/cl_kernels/histogram.cl +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define VATOMIC_INC16(histogram, win_pos) \ - { \ - atomic_inc(histogram + win_pos.s0); \ - atomic_inc(histogram + win_pos.s1); \ - atomic_inc(histogram + win_pos.s2); \ - atomic_inc(histogram + win_pos.s3); \ - atomic_inc(histogram + win_pos.s4); \ - atomic_inc(histogram + win_pos.s5); \ - atomic_inc(histogram + win_pos.s6); \ - atomic_inc(histogram + win_pos.s7); \ - atomic_inc(histogram + win_pos.s8); \ - atomic_inc(histogram + win_pos.s9); \ - atomic_inc(histogram + win_pos.sa); \ - atomic_inc(histogram + win_pos.sb); \ - atomic_inc(histogram + win_pos.sc); \ - atomic_inc(histogram + win_pos.sd); \ - atomic_inc(histogram + win_pos.se); \ - atomic_inc(histogram + win_pos.sf); \ - } - -/** Calculate the histogram of an 8 bit grayscale image. - * - * Each thread will process 16 pixels and use one local atomic operation per pixel. - * When all work items in a work group are done the resulting local histograms are - * added to the global histogram using global atomics. - * - * @note The input image is represented as a two-dimensional array of type uchar. - * The output is represented as a one-dimensional uint array of length of num_bins - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32 - * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 - * @param[out] num_bins The number of bins - * @param[out] offset The start of values to use (inclusive) - * @param[out] range The range of a bin - * @param[out] offrange The maximum value (exclusive) - */ -__kernel void hist_local_kernel(IMAGE_DECLARATION(input), - __local uint *histogram_local, - __global uint *restrict histogram, - uint num_bins, - uint offset, - uint range, - uint offrange) -{ - Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); - uint local_id_x = get_local_id(0); - - uint local_x_size = get_local_size(0); - - if(num_bins > local_x_size) - { - for(int i = local_id_x; i < num_bins; i += local_x_size) - { - histogram_local[i] = 0; - } - } - else - { - if(local_id_x <= num_bins) - { - histogram_local[local_id_x] = 0; - } - } - - uint16 vals = convert_uint16(vload16(0, input_buffer.ptr)); - - uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange)); - - barrier(CLK_LOCAL_MEM_FENCE); - VATOMIC_INC16(histogram_local, win_pos); - barrier(CLK_LOCAL_MEM_FENCE); - - if(num_bins > local_x_size) - { - for(int i = local_id_x; i < num_bins; i += local_x_size) - { - atomic_add(histogram + i, histogram_local[i]); - } - } - else - { - if(local_id_x <= num_bins) - { - atomic_add(histogram + local_id_x, histogram_local[local_id_x]); - } - } -} - -/** Calculate the histogram of an 8 bit grayscale image's border. - * - * Each thread will process one pixel using global atomic. - * When all work items in a work group are done the resulting local histograms are - * added to the global histogram using global atomics. - * - * @note The input image is represented as a two-dimensional array of type uchar. - * The output is represented as a one-dimensional uint array of length of num_bins - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 - * @param[out] num_bins The number of bins - * @param[out] offset The start of values to use (inclusive) - * @param[out] range The range of a bin - * @param[out] offrange The maximum value (exclusive) - */ -__kernel void hist_border_kernel(IMAGE_DECLARATION(input), - __global uint *restrict histogram, - uint num_bins, - uint offset, - uint range, - uint offrange) -{ - Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); - - uint val = (uint)(*input_buffer.ptr); - - uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0; - - if(val >= offset && (val < offrange)) - { - atomic_inc(histogram + win_pos); - } -} - -/** Calculate the histogram of an 8 bit grayscale image with bin size of 256 and window size of 1. - * - * Each thread will process 16 pixels and use one local atomic operation per pixel. - * When all work items in a work group are done the resulting local histograms are - * added to the global histogram using global atomics. - * - * @note The input image is represented as a two-dimensional array of type uchar. - * The output is represented as a one-dimensional uint array of 256 elements - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] histogram_local The local buffer to hold histogram result in per workgroup. Supported data types: U32 - * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 - */ -__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input), - __local uint *histogram_local, - __global uint *restrict histogram) -{ - Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); - - uint local_index = get_local_id(0); - uint local_x_size = get_local_size(0); - - for(int i = local_index; i < 256; i += local_x_size) - { - histogram_local[i] = 0; - } - - uint16 vals = convert_uint16(vload16(0, input_buffer.ptr)); - - barrier(CLK_LOCAL_MEM_FENCE); - - atomic_inc(histogram_local + vals.s0); - atomic_inc(histogram_local + vals.s1); - atomic_inc(histogram_local + vals.s2); - atomic_inc(histogram_local + vals.s3); - atomic_inc(histogram_local + vals.s4); - atomic_inc(histogram_local + vals.s5); - atomic_inc(histogram_local + vals.s6); - atomic_inc(histogram_local + vals.s7); - atomic_inc(histogram_local + vals.s8); - atomic_inc(histogram_local + vals.s9); - atomic_inc(histogram_local + vals.sa); - atomic_inc(histogram_local + vals.sb); - atomic_inc(histogram_local + vals.sc); - atomic_inc(histogram_local + vals.sd); - atomic_inc(histogram_local + vals.se); - atomic_inc(histogram_local + vals.sf); - - barrier(CLK_LOCAL_MEM_FENCE); - - for(int i = local_index; i < 256; i += local_x_size) - { - atomic_add(histogram + i, histogram_local[i]); - } -} - -/** Calculate the histogram of an 8 bit grayscale image with bin size as 256 and window size as 1. - * - * Each thread will process one pixel using global atomic. - * When all work items in a work group are done the resulting local histograms are - * added to the global histogram using global atomics. - * - * @note The input image is represented as a two-dimensional array of type uchar. - * The output is represented as a one-dimensional uint array of 256 - * - * @param[in] input_ptr Pointer to the first source image. Supported data types: U8 - * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] histogram The output buffer to hold histogram final result. Supported data types: U32 - */ -__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input), - __global uint *restrict histogram) -{ - Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input); - atomic_inc(histogram + *input_buffer.ptr); -} diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl deleted file mode 100644 index b14f361df6..0000000000 --- a/src/core/CL/cl_kernels/hog.cl +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -#if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE) - -/** This OpenCL kernel computes the HOG orientation binning - * - * @attention The following variables must be passed at compile time: - * - * -# -DCELL_WIDTH = Width of the cell - * -# -DCELL_HEIGHT = height of the cell - * -# -DNUM_BINS = Number of bins for each cell - * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG - * - * @note Each work-item computes a single cell - * - * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16 - * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes) - * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes) - * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image - * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8 - * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes) - * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] phase_stride_y Stride of the the phase image in Y dimension (in bytes) - * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the the phase image - * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag), - IMAGE_DECLARATION(phase), - IMAGE_DECLARATION(dst)) -{ - float bins[NUM_BINS] = { 0 }; - - // Compute address for the magnitude and phase images - Image mag = CONVERT_TO_IMAGE_STRUCT(mag); - Image phase = CONVERT_TO_IMAGE_STRUCT(phase); - - __global uchar *mag_row_ptr = mag.ptr; - __global uchar *phase_row_ptr = phase.ptr; - - for(int yc = 0; yc < CELL_HEIGHT; ++yc) - { - int xc = 0; - for(; xc <= (CELL_WIDTH - 4); xc += 4) - { - // Load magnitude and phase values - const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc)); - float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc)); - - // Scale phase: phase * scale + 0.5f - phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE; - - // Compute histogram index. - int4 hidx_s32 = convert_int4(phase_f32); - - // Compute magnitude weights (w0 and w1) - const float4 hidx_f32 = convert_float4(hidx_s32); - - // w1 = phase_f32 - hidx_s32 - const float4 w1_f32 = phase_f32 - hidx_f32; - - // w0 = 1.0 - w1 - const float4 w0_f32 = (float4)1.0f - w1_f32; - - // Calculate the weights for splitting vote - const float4 mag_w0_f32 = mag_f32 * w0_f32; - const float4 mag_w1_f32 = mag_f32 * w1_f32; - - // Weighted vote between 2 bins - - // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0 - hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS)); - - // Bin 0 - bins[hidx_s32.s0] += mag_w0_f32.s0; - bins[hidx_s32.s1] += mag_w0_f32.s1; - bins[hidx_s32.s2] += mag_w0_f32.s2; - bins[hidx_s32.s3] += mag_w0_f32.s3; - - hidx_s32 += (int4)1; - - // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0 - hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS)); - - // Bin1 - bins[hidx_s32.s0] += mag_w1_f32.s0; - bins[hidx_s32.s1] += mag_w1_f32.s1; - bins[hidx_s32.s2] += mag_w1_f32.s2; - bins[hidx_s32.s3] += mag_w1_f32.s3; - } - - // Left over computation - for(; xc < CELL_WIDTH; xc++) - { - const float mag_value = *((__global short *)mag_row_ptr + xc); - const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f; - const float w1 = phase_value - floor(phase_value); - - // The quantised phase is the histogram index [0, NUM_BINS - 1] - // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0 - const uint hidx = (uint)(phase_value) % NUM_BINS; - - // Weighted vote between 2 bins - bins[hidx] += mag_value * (1.0f - w1); - bins[(hidx + 1) % NUM_BINS] += mag_value * w1; - } - - // Point to the next row of magnitude and phase images - mag_row_ptr += mag_stride_y; - phase_row_ptr += phase_stride_y; - } - - // Compute address for the destination image - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Store the local HOG in the global memory - int xc = 0; - for(; xc <= (NUM_BINS - 4); xc += 4) - { - float4 values = vload4(0, bins + xc); - - vstore4(values, 0, ((__global float *)dst.ptr) + xc); - } - - // Left over stores - for(; xc < NUM_BINS; ++xc) - { - ((__global float *)dst.ptr)[xc] = bins[xc]; - } -} -#endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */ - -#if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD) - -#ifndef L2_NORM -#error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel -#endif /* not L2_NORM */ - -#ifndef L2HYS_NORM -#error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel -#endif /* not L2HYS_NORM */ - -#ifndef L1_NORM -#error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel -#endif /* not L1_NORM */ - -/** This OpenCL kernel computes the HOG block normalization - * - * @attention The following variables must be passed at compile time: - * - * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block - * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction - * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block - * -# -DHOG_NORM_TYPE = Normalization type - * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method - * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM - * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM - * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM - * - * @note Each work-item computes a single block - * - * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image which stores the normlized HOG Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void hog_block_normalization(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - float sum = 0.0f; - float4 sum_f32 = (float4)(0.0f); - - // Compute address for the source and destination tensor - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc) - { - const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y); - - int xc = 0; - for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16) - { - const float4 val0 = vload4(0, hist_ptr + xc + 0); - const float4 val1 = vload4(0, hist_ptr + xc + 4); - const float4 val2 = vload4(0, hist_ptr + xc + 8); - const float4 val3 = vload4(0, hist_ptr + xc + 12); - -#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) - // Compute val^2 for L2_NORM or L2HYS_NORM - sum_f32 += val0 * val0; - sum_f32 += val1 * val1; - sum_f32 += val2 * val2; - sum_f32 += val3 * val3; -#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */ - // Compute |val| for L1_NORM - sum_f32 += fabs(val0); - sum_f32 += fabs(val1); - sum_f32 += fabs(val2); - sum_f32 += fabs(val3); -#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */ - - // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization. - // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values - // will be accessed consecutively - vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X); - vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X); - vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X); - vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X); - } - - // Compute left over - for(; xc < NUM_BINS_PER_BLOCK_X; ++xc) - { - const float val = hist_ptr[xc]; - -#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) - sum += val * val; -#else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */ - sum += fabs(val); -#endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */ - - ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val; - } - } - - sum += dot(sum_f32, (float4)1.0f); - - float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f); - -#if(HOG_NORM_TYPE == L2HYS_NORM) - // Reset sum - sum_f32 = (float4)0.0f; - sum = 0.0f; - - int k = 0; - for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16) - { - float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0); - float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4); - float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8); - float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12); - - // Scale val - val0 = val0 * (float4)scale; - val1 = val1 * (float4)scale; - val2 = val2 * (float4)scale; - val3 = val3 * (float4)scale; - - // Clip val if over _threshold_l2hys - val0 = fmin(val0, (float4)L2_HYST_THRESHOLD); - val1 = fmin(val1, (float4)L2_HYST_THRESHOLD); - val2 = fmin(val2, (float4)L2_HYST_THRESHOLD); - val3 = fmin(val3, (float4)L2_HYST_THRESHOLD); - - // Compute val^2 - sum_f32 += val0 * val0; - sum_f32 += val1 * val1; - sum_f32 += val2 * val2; - sum_f32 += val3 * val3; - - vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0); - vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4); - vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8); - vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12); - } - - // Compute left over - for(; k < NUM_BINS_PER_BLOCK; ++k) - { - float val = ((__global float *)dst.ptr)[k] * scale; - - // Clip scaled input_value if over L2_HYST_THRESHOLD - val = fmin(val, (float)L2_HYST_THRESHOLD); - - sum += val * val; - - ((__global float *)dst.ptr)[k] = val; - } - - sum += dot(sum_f32, (float4)1.0f); - - // We use the same constants of OpenCV - scale = 1.0f / (sqrt(sum) + 1e-3f); - -#endif /* (HOG_NORM_TYPE == L2HYS_NORM) */ - - int i = 0; - for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16) - { - float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0); - float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4); - float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8); - float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12); - - // Multiply val by the normalization scale factor - val0 = val0 * (float4)scale; - val1 = val1 * (float4)scale; - val2 = val2 * (float4)scale; - val3 = val3 * (float4)scale; - - vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0); - vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4); - vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8); - vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12); - } - - for(; i < NUM_BINS_PER_BLOCK; ++i) - { - ((__global float *)dst.ptr)[i] *= scale; - } -} -#endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */ - -#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT) - -/** This OpenCL kernel computes the HOG detector using linear SVM - * - * @attention The following variables must be passed at compile time: - * - * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction - * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction - * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane - * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectioWindow array - * -# -DIDX_CLASS = Index of the class to detect - * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction - * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction - * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window - * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window - * - * @note Each work-item computes a single detection window - * - * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32 - * @param[out] dst Pointer to DetectionWindow array - * @param[out] num_detection_windows Number of objects detected - */ -__kernel void hog_detector(IMAGE_DECLARATION(src), - __global float *hog_descriptor, - __global DetectionWindow *dst, - __global uint *num_detection_windows) -{ - // Check if the DetectionWindow array is full - if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS) - { - return; - } - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - const int src_step_y_f32 = src_stride_y / sizeof(float); - - // Init score_f32 with 0 - float4 score_f32 = (float4)0.0f; - - // Init score with 0 - float score = 0.0f; - - __global float *src_row_ptr = (__global float *)src.ptr; - - // Compute Linear SVM - for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32) - { - int xb = 0; - - const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X; - - for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8) - { - // Load descriptor values - float4 a0_f32 = vload4(0, src_row_ptr + xb + 0); - float4 a1_f32 = vload4(0, src_row_ptr + xb + 4); - - float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y); - float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y); - - // Multiply accumulate - score_f32 += a0_f32 * b0_f32; - score_f32 += a1_f32 * b1_f32; - } - - for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb) - { - const float a = src_row_ptr[xb]; - const float b = hog_descriptor[xb + offset_y]; - - score += a * b; - } - } - - score += dot(score_f32, (float4)1.0f); - - // Add the bias. The bias is located at the position (descriptor_size() - 1) - // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y - score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y]; - - if(score > (float)THRESHOLD) - { - int id = atomic_inc(num_detection_windows); - if(id < MAX_NUM_DETECTION_WINDOWS) - { - dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH; - dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT; - dst[id].width = DETECTION_WINDOW_WIDTH; - dst[id].height = DETECTION_WINDOW_HEIGHT; - dst[id].idx_class = IDX_CLASS; - dst[id].score = score; - } - } -} -#endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS && - * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */ diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl deleted file mode 100644 index dd2c7982f4..0000000000 --- a/src/core/CL/cl_kernels/integral_image.cl +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function computes the horizontal integral of the image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U32 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void integral_horizontal( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - uint prev = 0; - - for(uint j = 0; j < src_step_x; j += 16) - { - barrier(CLK_GLOBAL_MEM_FENCE); - uint16 res = convert_uint16(vload16(0, offset(&src, j, 0))); - res.s0 += prev; - res.s1 += res.s0; - res.s2 += res.s1; - res.s3 += res.s2; - res.s4 += res.s3; - res.s5 += res.s4; - res.s6 += res.s5; - res.s7 += res.s6; - res.s8 += res.s7; - res.s9 += res.s8; - res.sA += res.s9; - res.sB += res.sA; - res.sC += res.sB; - res.sD += res.sC; - res.sE += res.sD; - res.sF += res.sE; - prev = res.sF; - vstore16(res, 0, (__global uint *)offset(&dst, j, 0)); - } -} - -/** This function computes the vertical integral of the image. - * - * @param[in,out] src_ptr Pointer to the source image. Supported data types: U32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] height Image height. - */ -__kernel void integral_vertical( - IMAGE_DECLARATION(src), - uint height) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0)); - for(uint j = 1; j < height; ++j) - { - barrier(CLK_GLOBAL_MEM_FENCE); - uint8 res = vload8(0, (__global uint *)offset(&src, 0, j)); - res += prev; - vstore8(res, 0, (__global uint *)offset(&src, 0, j)); - prev = res; - } -} diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl deleted file mode 100644 index 48197d6473..0000000000 --- a/src/core/CL/cl_kernels/magnitude_phase.cl +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Calculates L1 normalization between two inputs. - * - * @param[in] a First input. Supported data types: S16, S32 - * @param[in] b Second input. Supported data types: S16, S32 - * - * @return L1 normalization magnitude result. Supported data types: S16, S32 - */ -inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16)); -} - -/** Calculates L2 normalization between two inputs. - * - * @param[in] a First input. Supported data types: S16, S32 - * @param[in] b Second input. Supported data types: S16, S32 - * - * @return L2 normalization magnitude result. Supported data types: S16, S32 - */ -inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b) -{ - return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f), - VEC_DATA_TYPE(DATA_TYPE, 16)); -} - -/** Calculates unsigned phase between two inputs. - * - * @param[in] a First input. Supported data types: S16, S32 - * @param[in] b Second input. Supported data types: S16, S32 - * - * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8 - */ -inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f; - angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f); - return convert_uchar16(angle_deg_f32); -} - -/** Calculates signed phase between two inputs. - * - * @param[in] a First input. Supported data types: S16, S32 - * @param[in] b Second input. Supported data types: S16, S32 - * - * @return Signed phase mapped in the interval [0, 256). Supported data types: U8 - */ -inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b) -{ - float16 arct = atan2pi(convert_float16(b), convert_float16(a)); - arct = select(arct, arct + 2, arct < 0.0f); - - return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu); -} - -#if(1 == MAGNITUDE) -#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y)) -#elif(2 == MAGNITUDE) -#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y)) -#else /* MAGNITUDE */ -#define MAGNITUDE_OP(x, y) -#endif /* MAGNITUDE */ - -#if(1 == PHASE) -#define PHASE_OP(x, y) phase_unsigned((x), (y)) -#elif(2 == PHASE) -#define PHASE_OP(x, y) phase_signed((x), (y)) -#else /* PHASE */ -#define PHASE_OP(x, y) -#endif /* PHASE */ - -/** Calculate the magnitude and phase of given the gradients of an image. - * - * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2). - * @note Phase calculation supported: Unsigned(type = 1) [0,128] and Signed(type = 2) [0,256). - * - * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time. eg -DPHASE=1 - * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time. eg -DMAGNITUDE=1 - * @attention Datatype of the two inputs is passed at compile time using -DDATA_TYPE. e.g -DDATA_TYPE=short. Supported data_types are: short and int - * - * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32 - * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] gy_ptr Pointer to the second source image (gradient Y) . Supported data types: S16, S32 - * @param[in] gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32 - * @param[in] magnitude_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] magnitude_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8 - * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image - * */ -__kernel void magnitude_phase( - IMAGE_DECLARATION(gx), - IMAGE_DECLARATION(gy) -#ifdef MAGNITUDE - , - IMAGE_DECLARATION(magnitude) -#endif /* MAGNITUDE */ -#ifdef PHASE - , - IMAGE_DECLARATION(phase) -#endif /* PHASE */ -) -{ - // Get pixels pointer - Image gx = CONVERT_TO_IMAGE_STRUCT(gx); - Image gy = CONVERT_TO_IMAGE_STRUCT(gy); - - // Load values - VEC_DATA_TYPE(DATA_TYPE, 16) - in_a = vload16(0, (__global DATA_TYPE *)gx.ptr); - VEC_DATA_TYPE(DATA_TYPE, 16) - in_b = vload16(0, (__global DATA_TYPE *)gy.ptr); - - // Calculate and store the results -#ifdef MAGNITUDE - Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude); - vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr); -#endif /* MAGNITUDE */ -#ifdef PHASE - Image phase = CONVERT_TO_IMAGE_STRUCT(phase); - vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr); -#endif /* PHASE */ -} diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl deleted file mode 100644 index 4ddf931e4b..0000000000 --- a/src/core/CL/cl_kernels/mean_stddev.cl +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable - -/** This function calculates the sum and sum of squares of a given input image. - * - * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] height Height of the input image - * @param[out] global_sum Global sum of all elements - * @param[out] global_sum_sq Global sum of squares of all elements - */ -__kernel void mean_stddev_accumulate( - IMAGE_DECLARATION(src), - uint height, - __global ulong *global_sum -#ifdef STDDEV - , - __global ulong *global_sum_sq -#endif /* STDDEV */ -) -{ - // Get pixels pointer - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - uint8 tmp_sum = 0; -#ifdef STDDEV - uint8 tmp_sum_sq = 0; -#endif /* STDDEV */ - // Calculate partial sum - for(int i = 0; i < height; i++) - { - // Load data - uint8 data = convert_uint8(vload8(0, offset(&src, 0, i))); - - tmp_sum += data; -#ifdef STDDEV - tmp_sum_sq += data * data; -#endif /* STDDEV */ - } - // Perform reduction - tmp_sum.s0123 += tmp_sum.s4567; - tmp_sum.s01 += tmp_sum.s23; - atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1); - -#ifdef STDDEV - tmp_sum_sq.s0123 += tmp_sum_sq.s4567; - tmp_sum_sq.s01 += tmp_sum_sq.s23; - atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1); -#endif /* STDDEV */ -} - -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl deleted file mode 100644 index 1045f22fb1..0000000000 --- a/src/core/CL/cl_kernels/minmaxloc.cl +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -#ifndef DATA_TYPE_MIN -#define DATA_TYPE_MIN 0x0 -#endif /* DATA_TYPE_MIN */ - -#ifndef DATA_TYPE_MAX -#define DATA_TYPE_MAX 0xFF -#endif /* DATA_TYPE_MAX */ - -inline int FloatFlip(float val) -{ - union - { - int int_val; - float flt_val; - } u_val; - u_val.flt_val = val; - return (u_val.int_val >= 0) ? u_val.int_val : u_val.int_val ^ 0x7FFFFFFF; -} - -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN); -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX); -__constant int16 idx16 = (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - -/** This function identifies the min and maximum value of an input image. - * - * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE. - * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively. - * @note In case image width is not a multiple of 16 then -DNON_MULTIPLE_OF_16 must be passed. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1 - * @param[in] width Input image width - */ -__kernel void minmax( - IMAGE_DECLARATION(src), - __global int *min_max, - int width) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - // Initialize local minimum and local maximum - VEC_DATA_TYPE(DATA_TYPE, 16) - local_min = type_max; - VEC_DATA_TYPE(DATA_TYPE, 16) - local_max = type_min; - - // Calculate min/max of row - int i = 0; - for(; i + 16 <= width; i += 16) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0)); - local_min = min(data, local_min); - local_max = max(data, local_max); - } - -#ifdef NON_MULTIPLE_OF_16 - // Handle non multiple of 16 - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)offset(&src, i, 0)); -#ifdef IS_DATA_TYPE_FLOAT - int16 valid_indices = (i + idx16) < width; -#else /* IS_DATA_TYPE_FLOAT */ - VEC_DATA_TYPE(DATA_TYPE, 16) - valid_indices = CONVERT((i + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16)); -#endif /* IS_DATA_TYPE_FLOAT */ - local_max = max(local_max, select(type_min, data, valid_indices)); - local_min = min(local_min, select(type_max, data, valid_indices)); -#endif /* NON_MULTIPLE_OF_16 */ - - // Perform min/max reduction - local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF); - local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF); - - local_min.s0123 = min(local_min.s0123, local_min.s4567); - local_max.s0123 = max(local_max.s0123, local_max.s4567); - - local_min.s01 = min(local_min.s01, local_min.s23); - local_max.s01 = max(local_max.s01, local_max.s23); - - local_min.s0 = min(local_min.s0, local_min.s1); - local_max.s0 = max(local_max.s0, local_max.s1); - - // Update global min/max -#ifdef IS_DATA_TYPE_FLOAT - atomic_min(&min_max[0], FloatFlip(local_min.s0)); - atomic_max(&min_max[1], FloatFlip(local_max.s0)); -#else /* IS_DATA_TYPE_FLOAT */ - atomic_min(&min_max[0], local_min.s0); - atomic_max(&min_max[1], local_max.s0); -#endif /* IS_DATA_TYPE_FLOAT */ -} - -/** This function counts the min and max occurrences in an image and tags their position. - * - * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values. - * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1 - * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1 - * @param[out] min_loc Array that holds the location of the minimum value occurrences - * @param[in] max_min_loc_count The maximum number of min value occurrences coordinates the array can hold - * @param[out] max_loc Array that holds the location of the maximum value occurrences - * @param[in] max_max_loc_count The maximum number of max value occurrences coordinates the array can hold - */ -__kernel void minmaxloc( - IMAGE_DECLARATION(src), - __global int *min_max, - __global uint *min_max_count -#ifdef LOCATE_MIN - , - __global Coordinates2D *min_loc, uint max_min_loc_count -#endif /* LOCATE_MIN */ -#ifdef LOCATE_MAX - , - __global Coordinates2D *max_loc, uint max_max_loc_count -#endif /* LOCATE_MAX */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - -#ifdef IS_DATA_TYPE_FLOAT - __global float *min_max_ptr = (__global float *)min_max; - float min_value = min_max_ptr[0]; - float max_value = min_max_ptr[1]; -#else /* IS_DATA_TYPE_FLOAT */ - int min_value = min_max[0]; - int max_value = min_max[1]; -#endif /* IS_DATA_TYPE_FLOAT */ - - DATA_TYPE value = *((__global DATA_TYPE *)src.ptr); -#ifdef COUNT_MIN_MAX - if(value == min_value) - { - uint idx = atomic_inc(&min_max_count[0]); -#ifdef LOCATE_MIN - if(idx < max_min_loc_count) - { - min_loc[idx].x = get_global_id(0); - min_loc[idx].y = get_global_id(1); - } -#endif /* LOCATE_MIN */ - } - if(value == max_value) - { - uint idx = atomic_inc(&min_max_count[1]); -#ifdef LOCATE_MAX - if(idx < max_max_loc_count) - { - max_loc[idx].x = get_global_id(0); - max_loc[idx].y = get_global_id(1); - } -#endif /* LOCATE_MAX */ - } -#endif /* COUNT_MIN_MAX */ -} diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl deleted file mode 100644 index 93c5024c52..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "non_linear_filter_helpers.h" - -/** This function applies a non linear filter on a 3x3 box basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_box3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = row_reduce_min_3(tmp); -#elif defined(MAX) - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = row_reduce_max_3(tmp); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = top.s12345678; - uchar8 p2 = top.s23456789; - uchar8 p3 = middle.s01234567; - uchar8 p4 = middle.s12345678; - uchar8 p5 = middle.s23456789; - uchar8 p6 = bottom.s01234567; - uchar8 p7 = bottom.s12345678; - uchar8 p8 = bottom.s23456789; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 3x3 cross basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_cross3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar8 top = vload8(0, offset(&src, 0, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar8 bottom = vload8(0, offset(&src, 0, 1)); - - // Apply respective filter -#ifdef MIN - uchar8 tmp_middle = row_reduce_min_3(middle); - uchar8 out = min(tmp_middle, min(top, bottom)); -#elif defined(MAX) - uchar8 tmp_middle = row_reduce_max_3(middle); - uchar8 out = max(tmp_middle, max(top, bottom)); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = middle.s01234567; - uchar8 p2 = middle.s12345678; - uchar8 p3 = middle.s23456789; - uchar8 p4 = bottom.s01234567; - uchar8 out = sort5(p0, p1, p2, p3, p4); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 3x3 disk basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_disk3x3( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top = vload16(0, offset(&src, -1, -1)); - uchar16 middle = vload16(0, offset(&src, -1, 0)); - uchar16 bottom = vload16(0, offset(&src, -1, 1)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(top, min(middle, bottom)); - uchar8 out = row_reduce_min_3(tmp); -#elif defined(MAX) - uchar16 tmp = max(top, max(middle, bottom)); - uchar8 out = row_reduce_max_3(tmp); -#elif defined(MEDIAN) - uchar8 p0 = top.s01234567; - uchar8 p1 = top.s12345678; - uchar8 p2 = top.s23456789; - uchar8 p3 = middle.s01234567; - uchar8 p4 = middle.s12345678; - uchar8 p5 = middle.s23456789; - uchar8 p6 = bottom.s01234567; - uchar8 p7 = bottom.s12345678; - uchar8 p8 = bottom.s23456789; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl deleted file mode 100644 index 7c87284a72..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "non_linear_filter_helpers.h" - -// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html - -/** Sorting network to sort 8 disks of diameter 5 and return their median. - * - * @param[in] top2 Values of elements two rows above. - * @param[in] top Values of elements one row above. - * @param[in] middle Values of middle elements. - * @param[in] bottom Values of elements one row below. - * @param[in] bottom2 Values of elements two rows below. - * - * @return Median values for 8 elements. - */ -inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2) -{ - uchar8 p0 = top2.s01234567; - uchar8 p1 = top2.s12345678; - uchar8 p2 = top2.s23456789; - uchar8 p3 = top.s01234567; - uchar8 p4 = top.s12345678; - uchar8 p5 = top.s23456789; - uchar8 p6 = top.s3456789A; - uchar8 p7 = top.s456789AB; - uchar8 p8 = middle.s01234567; - uchar8 p9 = middle.s12345678; - uchar8 p10 = middle.s23456789; - uchar8 p11 = middle.s3456789A; - uchar8 p12 = middle.s456789AB; - uchar8 p13 = bottom.s01234567; - uchar8 p14 = bottom.s12345678; - uchar8 p15 = bottom.s23456789; - uchar8 p16 = bottom.s3456789A; - uchar8 p17 = bottom.s456789AB; - uchar8 p18 = bottom2.s01234567; - uchar8 p19 = bottom2.s12345678; - uchar8 p20 = bottom2.s23456789; - - SORT(p0, p1); - SORT(p2, p3); - SORT(p4, p5); - SORT(p6, p7); - SORT(p8, p9); - SORT(p10, p11); - SORT(p12, p13); - SORT(p14, p15); - SORT(p16, p17); - SORT(p18, p19); - SORT(p0, p2); - SORT(p1, p3); - SORT(p4, p6); - SORT(p5, p7); - SORT(p8, p10); - SORT(p9, p11); - SORT(p12, p14); - SORT(p13, p15); - SORT(p16, p18); - SORT(p17, p19); - SORT(p1, p2); - SORT(p5, p6); - SORT(p0, p4); - SORT(p3, p7); - SORT(p9, p10); - SORT(p13, p14); - SORT(p8, p12); - SORT(p11, p15); - SORT(p17, p18); - SORT(p16, p20); - SORT(p1, p5); - SORT(p2, p6); - SORT(p9, p13); - SORT(p10, p14); - SORT(p0, p8); - SORT(p7, p15); - SORT(p17, p20); - SORT(p1, p4); - SORT(p3, p6); - SORT(p9, p12); - SORT(p11, p14); - SORT(p18, p20); - SORT(p0, p16); - SORT(p2, p4); - SORT(p3, p5); - SORT(p10, p12); - SORT(p11, p13); - SORT(p1, p9); - SORT(p6, p14); - SORT(p19, p20); - SORT(p3, p4); - SORT(p11, p12); - SORT(p1, p8); - SORT(p2, p10); - SORT(p5, p13); - SORT(p7, p14); - SORT(p3, p11); - SORT(p2, p8); - SORT(p4, p12); - SORT(p7, p13); - SORT(p1, p17); - SORT(p3, p10); - SORT(p5, p12); - SORT(p1, p16); - SORT(p2, p18); - SORT(p3, p9); - SORT(p6, p12); - SORT(p2, p16); - SORT(p3, p8); - SORT(p7, p12); - SORT(p5, p9); - SORT(p6, p10); - SORT(p4, p8); - SORT(p7, p11); - SORT(p3, p19); - SORT(p5, p8); - SORT(p7, p10); - SORT(p3, p18); - SORT(p4, p20); - SORT(p6, p8); - SORT(p7, p9); - SORT(p3, p17); - SORT(p5, p20); - SORT(p7, p8); - SORT(p3, p16); - SORT(p6, p20); - SORT(p5, p17); - SORT(p7, p20); - SORT(p4, p16); - SORT(p6, p18); - SORT(p5, p16); - SORT(p7, p19); - SORT(p7, p18); - SORT(p6, p16); - SORT(p7, p17); - SORT(p10, p18); - SORT(p7, p16); - SORT(p9, p17); - SORT(p8, p16); - SORT(p9, p16); - SORT(p10, p16); - - return p10; -} - -/** Sorting network to sort 8 boxes of size 5 and return their median. - * - * @param[in] top2 Values of elements two rows above. - * @param[in] top Values of elements one row above. - * @param[in] middle Values of middle elements. - * @param[in] bottom Values of elements one row below. - * @param[in] bottom2 Values of elements two rows below. - * - * @return Median values for 8 elements. - */ -inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2) -{ - uchar8 p0 = top2.s01234567; - uchar8 p1 = top2.s12345678; - uchar8 p2 = top2.s23456789; - uchar8 p3 = top2.s3456789A; - uchar8 p4 = top2.s456789AB; - uchar8 p5 = top.s01234567; - uchar8 p6 = top.s12345678; - uchar8 p7 = top.s23456789; - uchar8 p8 = top.s3456789A; - uchar8 p9 = top.s456789AB; - uchar8 p10 = middle.s01234567; - uchar8 p11 = middle.s12345678; - uchar8 p12 = middle.s23456789; - uchar8 p13 = middle.s3456789A; - uchar8 p14 = middle.s456789AB; - uchar8 p15 = bottom.s01234567; - uchar8 p16 = bottom.s12345678; - uchar8 p17 = bottom.s23456789; - uchar8 p18 = bottom.s3456789A; - uchar8 p19 = bottom.s456789AB; - uchar8 p20 = bottom2.s01234567; - uchar8 p21 = bottom2.s12345678; - uchar8 p22 = bottom2.s23456789; - uchar8 p23 = bottom2.s3456789A; - uchar8 p24 = bottom2.s456789AB; - - SORT(p1, p2); - SORT(p0, p1); - SORT(p1, p2); - SORT(p4, p5); - SORT(p3, p4); - SORT(p4, p5); - SORT(p0, p3); - SORT(p2, p5); - SORT(p2, p3); - SORT(p1, p4); - SORT(p1, p2); - SORT(p3, p4); - SORT(p7, p8); - SORT(p6, p7); - SORT(p7, p8); - SORT(p10, p11); - SORT(p9, p10); - SORT(p10, p11); - SORT(p6, p9); - SORT(p8, p11); - SORT(p8, p9); - SORT(p7, p10); - SORT(p7, p8); - SORT(p9, p10); - SORT(p0, p6); - SORT(p4, p10); - SORT(p4, p6); - SORT(p2, p8); - SORT(p2, p4); - SORT(p6, p8); - SORT(p1, p7); - SORT(p5, p11); - SORT(p5, p7); - SORT(p3, p9); - SORT(p3, p5); - SORT(p7, p9); - SORT(p1, p2); - SORT(p3, p4); - SORT(p5, p6); - SORT(p7, p8); - SORT(p9, p10); - SORT(p13, p14); - SORT(p12, p13); - SORT(p13, p14); - SORT(p16, p17); - SORT(p15, p16); - SORT(p16, p17); - SORT(p12, p15); - SORT(p14, p17); - SORT(p14, p15); - SORT(p13, p16); - SORT(p13, p14); - SORT(p15, p16); - SORT(p19, p20); - SORT(p18, p19); - SORT(p19, p20); - SORT(p21, p22); - SORT(p23, p24); - SORT(p21, p23); - SORT(p22, p24); - SORT(p22, p23); - SORT(p18, p21); - SORT(p20, p23); - SORT(p20, p21); - SORT(p19, p22); - SORT(p22, p24); - SORT(p19, p20); - SORT(p21, p22); - SORT(p23, p24); - SORT(p12, p18); - SORT(p16, p22); - SORT(p16, p18); - SORT(p14, p20); - SORT(p20, p24); - SORT(p14, p16); - SORT(p18, p20); - SORT(p22, p24); - SORT(p13, p19); - SORT(p17, p23); - SORT(p17, p19); - SORT(p15, p21); - SORT(p15, p17); - SORT(p19, p21); - SORT(p13, p14); - SORT(p15, p16); - SORT(p17, p18); - SORT(p19, p20); - SORT(p21, p22); - SORT(p23, p24); - SORT(p0, p12); - SORT(p8, p20); - SORT(p8, p12); - SORT(p4, p16); - SORT(p16, p24); - SORT(p12, p16); - SORT(p2, p14); - SORT(p10, p22); - SORT(p10, p14); - SORT(p6, p18); - SORT(p6, p10); - SORT(p10, p12); - SORT(p1, p13); - SORT(p9, p21); - SORT(p9, p13); - SORT(p5, p17); - SORT(p13, p17); - SORT(p3, p15); - SORT(p11, p23); - SORT(p11, p15); - SORT(p7, p19); - SORT(p7, p11); - SORT(p11, p13); - SORT(p11, p12); - return p12; -} - -/** This function applies a non linear filter on a 5x5 box basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_box5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top2 = vload16(0, offset(&src, -2, -2)); - uchar16 top = vload16(0, offset(&src, -2, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar16 bottom = vload16(0, offset(&src, -2, 1)); - uchar16 bottom2 = vload16(0, offset(&src, -2, 2)); - - // Apply respective filter -#ifdef MIN - uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2))); - uchar8 out = row_reduce_min_5(tmp); -#elif defined(MAX) - uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2))); - uchar8 out = row_reduce_max_5(tmp); -#elif defined(MEDIAN) - uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 5x5 cross basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_cross5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar8 top2 = vload8(0, offset(&src, 0, -2)); - uchar8 top = vload8(0, offset(&src, 0, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar8 bottom = vload8(0, offset(&src, 0, 1)); - uchar8 bottom2 = vload8(0, offset(&src, 0, 2)); - - // Apply respective filter -#ifdef MIN - uchar8 tmp_middle = row_reduce_min_5(middle); - uchar8 out = min(tmp_middle, min(min(top2, top), min(bottom, bottom2))); -#elif defined(MAX) - uchar8 tmp_middle = row_reduce_max_5(middle); - uchar8 out = max(tmp_middle, max(max(top2, top.s01234567), max(bottom, bottom2))); -#elif defined(MEDIAN) - uchar8 p0 = top2; - uchar8 p1 = top; - uchar8 p2 = middle.s01234567; - uchar8 p3 = middle.s12345678; - uchar8 p4 = middle.s23456789; - uchar8 p5 = middle.s3456789A; - uchar8 p6 = middle.s456789AB; - uchar8 p7 = bottom; - uchar8 p8 = bottom2; - uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} - -/** This function applies a non linear filter on a 5x5 disk basis on an input image. - * - * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_linear_filter_disk5x5( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values - uchar16 top2 = vload16(0, offset(&src, -2, -2)); - uchar16 top = vload16(0, offset(&src, -2, -1)); - uchar16 middle = vload16(0, offset(&src, -2, 0)); - uchar16 bottom = vload16(0, offset(&src, -2, 1)); - uchar16 bottom2 = vload16(0, offset(&src, -2, 2)); - - // Shift top2 and bottom2 values - top2 = top2.s123456789ABCDEFF; - bottom2 = bottom2.s123456789ABCDEFF; - - // Apply respective filter -#ifdef MIN - uchar16 tmp_3 = min(top2, bottom2); - uchar16 tmp_5 = min(middle, min(top, bottom)); - uchar8 tmp_3_red = row_reduce_min_3(tmp_3); - uchar8 tmp_5_red = row_reduce_min_5(tmp_5); - uchar8 out = min(tmp_3_red, tmp_5_red); -#elif defined(MAX) - uchar16 tmp_3 = max(top2, bottom2); - uchar16 tmp_5 = max(middle, max(top, bottom)); - uchar8 tmp_3_red = row_reduce_max_3(tmp_3); - uchar8 tmp_5_red = row_reduce_max_5(tmp_5); - uchar8 out = max(tmp_3_red, tmp_5_red); -#elif defined(MEDIAN) - uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2); -#else /* MIN or MAX or MEDIAN */ -#error "Unsupported filter function" -#endif /* MIN or MAX or MEDIAN */ - - // Store result - vstore8(out, 0, dst.ptr); -} diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h deleted file mode 100644 index 3fcfad46f5..0000000000 --- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** Sorts element-wise two vectors. - * - * @param[in, out] a First vector - * @param[in, out] b Second vector - */ -#define SORT(a, b) \ - { \ - uchar8 min_val = min(a, b); \ - uchar8 max_val = max(a, b); \ - a = min_val; \ - b = max_val; \ - } - -// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html - -/** Sorting network to sort 5 vectors of 8 elements and return their median. - * - * @param[in] p0 First element vector - * @param[in] p1 Second element vector - * @param[in] p2 Third element vector - * @param[in] p3 Fourth element vector - * @param[in] p4 Fifth element vector - * - * @return Median values for 8 elements. - */ -inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4) -{ - SORT(p0, p1); - SORT(p2, p3); - SORT(p0, p2); - SORT(p1, p3); - SORT(p1, p2); - SORT(p0, p4); - SORT(p1, p4); - SORT(p2, p4); - - return p2; -} - -/** Sorting network to sort 9 vectors of 8 elements and return their median. - * - * @param[in] p0 First element vector - * @param[in] p1 Second element vector - * @param[in] p2 Third element vector - * @param[in] p3 Fourth element vector - * @param[in] p4 Fifth element vector - * @param[in] p5 Sixth element vector - * @param[in] p6 Seventh element vector - * @param[in] p7 Eigth element vector - * @param[in] p8 Ninth element vector - * - * @return Median values for 8 elements. - */ -inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8) -{ - SORT(p1, p2); - SORT(p4, p5); - SORT(p7, p8); - SORT(p0, p1); - SORT(p3, p4); - SORT(p6, p7); - SORT(p1, p2); - SORT(p4, p5); - SORT(p7, p8); - SORT(p0, p3); - SORT(p5, p8); - SORT(p4, p7); - SORT(p3, p6); - SORT(p1, p4); - SORT(p2, p5); - SORT(p4, p7); - SORT(p4, p2); - SORT(p6, p4); - SORT(p4, p2); - - return p4; -} - -/** Calculate the minimum of a sliding window of size 3. - * - * @param val Values to calculate the minimum values - * - * @return Minimum values of 8 elements on a sliding window of size 3. - */ -inline uchar8 row_reduce_min_3(uchar16 val) -{ - return min(val.s01234567, min(val.s12345678, val.s23456789)); -} - -/** Calculate the maximum of a sliding window of size 3. - * - * @param val Values to calculate the maximum values - * - * @return Maximum values of 8 elements on a sliding window of size 3. - */ -inline uchar8 row_reduce_max_3(uchar16 val) -{ - return max(val.s01234567, max(val.s12345678, val.s23456789)); -} - -/** Calculate the minimum of a sliding window of size 5. - * - * @param val Values to calculate the minimum values - * - * @return Minimum values of 8 elements on a sliding window of size 5. - */ -inline uchar8 row_reduce_min_5(uchar16 val) -{ - return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB))); -} - -/** Calculate the maximum of a sliding window of size 5. - * - * @param val Values to calculate the maximum values - * - * @return Maximum values of 8 elements on a sliding window of size 5. - */ -inline uchar8 row_reduce_max_5(uchar16 val) -{ - return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB))); -} diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl deleted file mode 100644 index 9bbde1a57f..0000000000 --- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "types.h" - -/* - *The criteria for lost tracking is that the spatial gradient matrix has: - * - Determinant less than DETERMINANT_THR - * - or minimum eigenvalue is smaller then EIGENVALUE_THR - * - * The thresholds for the determinant and the minimum eigenvalue is - * defined by the OpenVX spec - * - * Note: Also lost tracking happens when the point tracked coordinate is outside - * the image coordinates - * - * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html - */ - -/* Internal Lucas-Kanade Keypoint struct */ -typedef struct InternalKeypoint -{ - float x; /**< The x coordinate. */ - float y; /**< The y coordinate. */ - float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */ - float dummy; /**< Dummy member for alignment. */ -} InternalKeypoint; - -/** Threshold for the determinant. Used for lost tracking criteria */ -#define DETERMINANT_THR 1.0e-07f - -/** Thresholds for minimum eigenvalue. Used for lost tracking criteria */ -#define EIGENVALUE_THR 1.0e-04f - -/** Constants used for Lucas-Kanade Algorithm */ -#define W_BITS (14) -#define FLT_SCALE (1.0f / (float)(1 << 20)) -#define D0 ((float)(1 << W_BITS)) -#define D1 (1.0f / (float)(1 << (W_BITS - 5))) - -/** Initializes the internal new points array when the level of pyramid is NOT equal to max. - * - * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. - * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. - * @param[in] scale Scale factor to apply for the new_point coordinates. - */ -__kernel void init_level( - __global float4 *old_points_internal, - __global float4 *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - // Get old and new keypoints - float4 old_point = old_points_internal[idx]; - float4 new_point = new_points_internal[idx]; - - // Scale accordingly with the pyramid_scale - old_point.xy *= (float2)(2.0f); - new_point.xy *= (float2)(2.0f); - - old_points_internal[idx] = old_point; - new_points_internal[idx] = new_point; -} - -/** Initializes the internal new points array when the level of pyramid is equal to max. - * - * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. - * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. - * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. - * @param[in] scale Scale factor to apply for the new_point coordinates. - */ -__kernel void init_level_max( - __global Keypoint *old_points, - __global InternalKeypoint *old_points_internal, - __global InternalKeypoint *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - Keypoint old_point = old_points[idx]; - - // Get old keypoint to track - InternalKeypoint old_point_internal; - old_point_internal.x = old_point.x * scale; - old_point_internal.y = old_point.y * scale; - old_point_internal.tracking_status = 1.f; - - // Store internal keypoints - old_points_internal[idx] = old_point_internal; - new_points_internal[idx] = old_point_internal; -} - -/** Initializes the new_points array when the level of pyramid is equal to max and if use_initial_estimate = 1. - * - * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid. - * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid. - * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid. - * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid. - * @param[in] scale Scale factor to apply for the new_point coordinates. - */ -__kernel void init_level_max_initial_estimate( - __global Keypoint *old_points, - __global Keypoint *new_points_estimates, - __global InternalKeypoint *old_points_internal, - __global InternalKeypoint *new_points_internal, - const float scale) -{ - int idx = get_global_id(0); - - Keypoint old_point = old_points[idx]; - Keypoint new_point_estimate = new_points_estimates[idx]; - InternalKeypoint old_point_internal; - InternalKeypoint new_point_internal; - - // Get old keypoint to track - old_point_internal.x = old_point.x * scale; - old_point_internal.y = old_point.y * scale; - old_point_internal.tracking_status = 1.f; - - // Get new keypoint to track - new_point_internal.x = new_point_estimate.x * scale; - new_point_internal.y = new_point_estimate.y * scale; - new_point_internal.tracking_status = new_point_estimate.tracking_status; - - // Store internal keypoints - old_points_internal[idx] = old_point_internal; - new_points_internal[idx] = new_point_internal; -} - -/** Truncates the coordinates stored in new_points array - * - * @param[in] new_points_internal An array of estimate key points that are defined at the new_images high resolution pyramid. - * @param[out] new_points An array of internal key points that are defined at the new_images high resolution pyramid. - */ -__kernel void finalize( - __global InternalKeypoint *new_points_internal, - __global Keypoint *new_points) -{ - int idx = get_global_id(0); - - // Load internal keypoint - InternalKeypoint new_point_internal = new_points_internal[idx]; - - // Calculate output point - Keypoint new_point; - new_point.x = round(new_point_internal.x); - new_point.y = round(new_point_internal.y); - new_point.strength = 0.f; - new_point.scale = 0.f; - new_point.orientation = 0.f; - new_point.tracking_status = new_point_internal.tracking_status; - new_point.error = 0.f; - - // Store new point - new_points[idx] = new_point; -} - -/** Computes A11, A12, A22, min_eig, ival, ixval and iyval at level 0th of the pyramid. These values will be used in step 1. - * - * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8 - * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes) - * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes) - * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image - * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16 - * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes) - * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes) - * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image - * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. Supported data types: S16 - * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes) - * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes) - * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image - * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid - * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid - * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint - * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint - * @param[in] window_dimension The size of the window on which to perform the algorithm - * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm - * @param[in] half_window The half size of the window on which to perform the algorithm - * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,) - * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension) - * @param[in] level0 It is set to 1 if level 0 of the pyramid - */ -void __kernel lktracker_stage0( - IMAGE_DECLARATION(old_image), - IMAGE_DECLARATION(old_scharr_gx), - IMAGE_DECLARATION(old_scharr_gy), - __global float4 *old_points, - __global float4 *new_points, - __global float4 *coeff, - __global short4 *iold_val, - const int window_dimension, - const int window_dimension_pow2, - const int half_window, - const float3 border_limits, - const float eig_const, - const int level0) -{ - int idx = get_global_id(0); - - Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image); - Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx); - Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy); - - // Get old keypoint - float2 old_keypoint = old_points[idx].xy - (float2)half_window; - - // Get the floor value - float2 iold_keypoint = floor(old_keypoint); - - // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point - if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy)) - { - if(level0 == 1) - { - // Invalidate tracked point as we are at level 0 - new_points[idx].s2 = 0.0f; - } - - // Not valid coordinate. It sets min_eig to 0.0f - coeff[idx].s3 = 0.0f; - - return; - } - - // Compute weight for the bilinear interpolation - float2 ab = old_keypoint - iold_keypoint; - - // Weight used for Bilinear-Interpolation on Scharr images - // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y) - // w_scharr.s1 = ab.x * (1.0f - ab.y) - // w_scharr.s2 = (1.0f - ab.x) * ab.y - // w_scharr.s3 = ab.x * ab.y - - float4 w_scharr; - w_scharr.s3 = ab.x * ab.y; - w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y; - w_scharr.s12 = ab - (float2)w_scharr.s3; - - // Weight used for Bilinear-Interpolation on Old and New images - // w.s0 = round(w_scharr.s0 * D0) - // w.s1 = round(w_scharr.s1 * D0) - // w.s2 = round(w_scharr.s2 * D0) - // w.s3 = w.s3 = D0 - w.s0 - w.s1 - w.s2 - - float4 w; - w = round(w_scharr * (float4)D0); - w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation - - // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig - int4 iG = (int4)0; - - // Window offset - int window_offset = idx * window_dimension_pow2; - - // Compute Spatial Gradient Matrix G - for(ushort ky = 0; ky < window_dimension; ++ky) - { - int offset_y = iold_keypoint.y + ky; - for(ushort kx = 0; kx < window_dimension; ++kx) - { - int offset_x = iold_keypoint.x + kx; - float4 px; - - // Load values from old_image for computing the bilinear interpolation - px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)), - vload2(0, offset(&old_image, offset_x, offset_y + 1)))); - - // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy - float4 old_i; - - // Compute bilinear interpolation (with D1 scale factor) for ival - old_i.s0 = dot(px, w) * D1; - - // Load values from old_scharr_gx for computing the bilinear interpolation - px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)), - vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1)))); - - // Compute bilinear interpolation for ixval - old_i.s1 = dot(px, w_scharr); - - // Load values from old_scharr_gy for computing the bilinear interpolation - px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)), - vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1)))); - - // Compute bilinear interpolation for iyval - old_i.s2 = dot(px, w_scharr); - - // Rounding (it could be omitted. Used just for matching the VX implementation) - int4 iold = convert_int4(round(old_i)); - - // Accumulate values in the Spatial Gradient Matrix - iG.s0 += (int)(iold.s1 * iold.s1); - iG.s1 += (int)(iold.s1 * iold.s2); - iG.s2 += (int)(iold.s2 * iold.s2); - - // Store ival, ixval and iyval - iold_val[window_offset + kx] = convert_short4(iold); - } - window_offset += window_dimension; - } - - // Scale iA11, iA12 and iA22 - float4 G = convert_float4(iG) * (float4)FLT_SCALE; - - // Compute minimum eigen value - G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const; - - // Store A11. A11, A22 and min_eig - coeff[idx] = G; -} - -/** Computes the motion vector for a given keypoint - * - * @param[in] new_image_ptr Pointer to the input new image. Supported data types: U8 - * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes) - * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes) - * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image - * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid - * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint - * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint - * @param[in] window_dimension The size of the window on which to perform the algorithm - * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm - * @param[in] half_window The half size of the window on which to perform the algorithm - * @param[in] num_iterations The maximum number of iterations - * @param[in] epsilon The value for terminating the algorithm. - * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,) - * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension) - * @param[in] level0 It is set to 1 if level of pyramid = 0 - * @param[in] term_epsilon It is set to 1 if termination = TERM_CRITERIA_EPSILON - */ -void __kernel lktracker_stage1( - IMAGE_DECLARATION(new_image), - __global float4 *new_points, - __global float4 *coeff, - __global short4 *iold_val, - const int window_dimension, - const int window_dimension_pow2, - const int half_window, - const int num_iterations, - const float epsilon, - const float3 border_limits, - const float eig_const, - const int level0, - const int term_epsilon) -{ - int idx = get_global_id(0); - Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image); - - // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig - float4 G = coeff[idx]; - - // Determinant - float D = G.s0 * G.s2 - G.s1 * G.s1; - - // Check if it is a good point to track - if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR) - { - if(level0 == 1) - { - // Invalidate tracked point as we are at level 0 - new_points[idx].s2 = 0; - } - - return; - } - - // Compute inverse - //D = native_recip(D); - D = 1.0 / D; - - // Get new keypoint - float2 new_keypoint = new_points[idx].xy - (float)half_window; - - // Get new point - float2 out_new_point = new_points[idx].xy; - - // Keep delta obtained in the previous iteration - float2 prev_delta = (float2)0.0f; - - int j = 0; - while(j < num_iterations) - { - // Get the floor value - float2 inew_keypoint = floor(new_keypoint); - - // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point - if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy)) - { - if(level0 == 1) - { - // Invalidate tracked point as we are at level 0 - new_points[idx].s2 = 0.0f; - } - else - { - new_points[idx].xy = out_new_point; - } - - return; - } - - // Compute weight for the bilinear interpolation - float2 ab = new_keypoint - inew_keypoint; - - // Weight used for Bilinear-Interpolation on Old and New images - // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0) - // w.s1 = round(ab.x * (1.0f - ab.y) * D0) - // w.s2 = round((1.0f - ab.x) * ab.y * D0) - // w.s3 = D0 - w.s0 - w.s1 - w.s2 - - float4 w; - w.s3 = ab.x * ab.y; - w.s0 = w.s3 + 1.0f - ab.x - ab.y; - w.s12 = ab - (float2)w.s3; - w = round(w * (float4)D0); - w.s3 = D0 - w.s0 - w.s1 - w.s2; - - // Mismatch vector - int2 ib = 0; - - // Old val offset - int old_val_offset = idx * window_dimension_pow2; - - for(int ky = 0; ky < window_dimension; ++ky) - { - for(int kx = 0; kx < window_dimension; ++kx) - { - // ival, ixval and iyval have been computed in the previous stage - int4 old_ival = convert_int4(iold_val[old_val_offset]); - - // Load values from old_image for computing the bilinear interpolation - float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)), - vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1)))); - - // Compute bilinear interpolation on new image - int jval = (int)round(dot(px, w) * D1); - - // Compute luminance difference - int diff = (int)(jval - old_ival.s0); - - // Accumulate values in mismatch vector - ib += (diff * old_ival.s12); - - // Update old val offset - old_val_offset++; - } - } - - float2 b = convert_float2(ib) * (float2)FLT_SCALE; - - // Optical Flow - float2 delta; - - delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D); - delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D); - - // Update new point coordinate - new_keypoint += delta; - - out_new_point = new_keypoint + (float2)half_window; - - if(term_epsilon == 1) - { - float mag2 = dot(delta, delta); - - if(mag2 <= epsilon) - { - new_points[idx].xy = out_new_point; - - return; - } - } - - // Check convergence analyzing the previous delta - if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f)) - { - out_new_point -= delta * (float2)0.5f; - - new_points[idx].xy = out_new_point; - - return; - } - - // Update previous delta - prev_delta = delta; - - j++; - } - - new_points[idx].xy = out_new_point; -} diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl deleted file mode 100644 index d2868b6731..0000000000 --- a/src/core/CL/cl_kernels/scharr_filter.cl +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This OpenCL kernel computes Scharr3x3. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void scharr3x3( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - short8 gx = (short8)0; -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = (short8)0; -#endif /* GRAD_Y */ - - // Row0 - uchar16 temp = vload16(0, offset(&src, -1, -1)); - short8 left = convert_short8(temp.s01234567); - short8 middle = convert_short8(temp.s12345678); - short8 right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-3); - gx += right * (short8)(+3); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(-3); - gy += middle * (short8)(-10); - gy += right * (short8)(-3); -#endif /* GRAD_Y */ - - // Row1 - temp = vload16(0, offset(&src, -1, 0)); - left = convert_short8(temp.s01234567); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-10); - gx += right * (short8)(+10); -#endif /* GRAD_X */ - - // Row2 - temp = vload16(0, offset(&src, -1, 1)); - left = convert_short8(temp.s01234567); - middle = convert_short8(temp.s12345678); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-3); - gx += right * (short8)(+3); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(+3); - gy += middle * (short8)(+10); - gy += right * (short8)(+3); -#endif /* GRAD_Y */ - - // Store results -#ifdef GRAD_X - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl deleted file mode 100644 index 0ef1648d94..0000000000 --- a/src/core/CL/cl_kernels/tablelookup.cl +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function performs table lookup on U8 input/output images. - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lut LUT table. Supported data types: U8 - */ -__kernel void tablelookup_U8( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __global uchar *lut) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - /* Load input data */ - uchar8 data = vload8(0, src.ptr); - - /* Load lut data */ - uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3], - lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]); - - /* Store result */ - vstore8(lut_data, 0, dst.ptr); -} - -/** This function performs table lookup on S16 input/output images. - * - * Global Workgroup Size [ DIV_CEIL(width, 8), height ] - * - * @param[in] src_ptr Pointer to the source image. Supported data types: S16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lut LUT table. Supported data types: S16 - * @param[in] offset LUT offset - * @param[in] count Number of elements in the LUT - */ -__kernel void tablelookup_S16( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - __global short *lut, - uint offset, - uint count) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - /* Load input data */ - short8 data = vload8(0, (__global short *)src.ptr); - - /* Load output data */ - int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr)); - - /* Calculate index */ - int8 index = convert_int8(data) + (int8)(offset); - int8 cond = (index >= 0 && index < (int8)count); - index = select(0, index, cond); - - /* Load lut data */ - int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3], - lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]); - - /* Select output data depending on condition */ - lut_data = select(out_data, lut_data, cond); - - /* Store result */ - vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr); -} diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl deleted file mode 100644 index ff3ac05ef4..0000000000 --- a/src/core/CL/cl_kernels/threshold.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Perform binary thresholding on an image. - * - * @param[in] in_ptr Pointer to the source image - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] out_ptr Pointer to the destination image - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] false_val False value - * @param[in] true_val True value - * @param[in] threshold The thresold value - */ -__kernel void threshold_binary( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const uchar false_val, - const uchar true_val, - const uchar threshold) -{ - // Get pixels pointer - Image in = CONVERT_TO_IMAGE_STRUCT(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - - // Load data - uchar16 in_data = vload16(0, in.ptr); - - // Perform binary thresholding - in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold); - - // Store result - vstore16(in_data, 0, out.ptr); -} - -/** Perform range thresholding on an image. - * - * @param[in] in_ptr Pointer to the source image - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[out] out_ptr Pointer to the destination image - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] false_val False value - * @param[in] true_val True value - * @param[in] lower Lower threshold - * @param[in] upper Upper threshold - */ -__kernel void threshold_range( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const uchar false_val, - const uchar true_val, - const uchar lower, - const uchar upper) -{ - // Get pixels pointer - Image in = CONVERT_TO_IMAGE_STRUCT(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - - // Load data - uchar16 in_data = vload16(0, in.ptr); - - // Perform range thresholding - in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower); - - // Store result - vstore16(in_data, 0, out.ptr); -} diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl deleted file mode 100644 index 909b92055b..0000000000 --- a/src/core/CL/cl_kernels/warp_affine.cl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "warp_helpers.h" - -/** Returns a vector of floats contaning the matrix coefficients. */ -inline const float8 build_affine_mtx() -{ - return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0); -} - -/** Transforms 4 2D coordinates using the formula: - * - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * - * @param[in] coord 2D coordinate to transform. - * @param[in] mtx affine matrix - * - * @return a int8 containing 4 2D transformed values. - */ -inline const float8 apply_affine_transform(const float2 coord, const float8 mtx) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - // transform [x,x+1,x+2,x+3] - const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4))); - // transform [y,y+1,y+2,y+3] - const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5))); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -} - -/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. - * - * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation: - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * output(x,y) = input(x0,y0) - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_affine_nearest_neighbour( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr); -} - -/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_affine_bilinear( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr); -} diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl deleted file mode 100644 index bed78388a4..0000000000 --- a/src/core/CL/cl_kernels/warp_perspective.cl +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "warp_helpers.h" - -/** Returns the perspective matrix */ -inline const float16 build_perspective_mtx() -{ - return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0); -} - -/** Transforms four 2D coordinates using the formula: - * - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * z0 = M[3][1] * x + M[3][2] * y + M[3][3] - * - * (x0/z0,y0/z0) - * - * @param[in] coord 2D coordinate to transform. - * @param[in] mtx perspective matrix - * - * @return a vector float8 containing four 2D transformed values. - */ -inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - // transform [z,z+1,z+2,z+3] - const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8))); - // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation - // transform [x,x+1,x+2,x+3] - const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z; - // transform [y,y+1,y+2,y+3] - const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z; - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -} - -/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. - * - * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation: - * x0 = M[1][1] * x + M[1][2] * y + M[1][3] - * y0 = M[2][1] * x + M[2][2] * y + M[2][3] - * z0 = M[3][1] * x + M[3][2] * y + M[3][3] - * - * output(x,y) = input(x0/z0,y0/z0) - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_perspective_nearest_neighbour( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr); -} - -/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8. - * - * @attention The matrix coefficients need to be passed at compile time:\n - * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n - * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] width Width of the destination image - * @param[in] height Height of the destination image - */ -__kernel void warp_perspective_bilinear( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const int width, - const int height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - vstore4(bilinear_interpolate(&in, apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), 0, out.ptr); -} diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp deleted file mode 100644 index 76b60cb9f8..0000000000 --- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); -} - -void CLAbsoluteDifferenceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), - "The output image can only be U8 if both input images are U8"); - - _input1 = input1; - _input2 = input2; - _output = output; - - // Set kernel build options - std::set build_opts; - build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); - build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); - build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - _kernel = create_kernel(compile_context, "absdiff", build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - - Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input1_access, input2_access, output_access); - - ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), - input2->info()->valid_region()); - - output_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); -} - -void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input1, slice); - add_2D_tensor_argument(idx, _input2, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h deleted file mode 100644 index 28f28fe44f..0000000000 --- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H -#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the absolute difference kernel. - * - * Absolute difference is computed by: - * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f] - */ -class CLAbsoluteDifferenceKernel : public ICLKernel -{ -public: - /** Default constructor. */ - CLAbsoluteDifferenceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete; - /** Allow instances of this class to be moved */ - CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default; - /** Allow instances of this class to be moved */ - CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default; - /** Default destructor */ - ~CLAbsoluteDifferenceKernel() = default; - - /** Set the inputs and output images. - * - * @param[in] input1 Source tensor. Data types supported: U8/S16. - * @param[in] input2 Source tensor. Data types supported: U8/S16. - * @param[out] output Destination tensor. Data types supported: U8/S16. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - /** Set the inputs and output images. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input1 Source tensor. Data types supported: U8/S16. - * @param[in] input2 Source tensor. Data types supported: U8/S16. - * @param[out] output Destination tensor. Data types supported: U8/S16. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input1; /**< Source tensor 1. */ - const ICLTensor *_input2; /**< Source tensor 2. */ - ICLTensor *_output; /**< Destination tensor. */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */ diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp deleted file mode 100644 index b0a8eba644..0000000000 --- a/src/core/CL/kernels/CLAccumulateKernel.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLAccumulateKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -namespace arm_compute -{ -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; -} // namespace - -void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, accum); -} - -void CLAccumulateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); - - // Create kernel - _kernel = create_kernel(compile_context, "accumulate"); - - // Make sure _kernel is initialized before calling the parent's configure - ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); -} - -void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum); -} - -void CLAccumulateWeightedKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0); - - // Create kernel - _kernel = create_kernel(compile_context, "accumulate_weighted"); - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, alpha); - - // Configure kernel window - ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); -} - -void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum); -} - -void CLAccumulateSquaredKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(shift > 15); - - // Create kernel - _kernel = create_kernel(compile_context, "accumulate_squared"); - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, shift); - - // Configure kernel window - ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLAccumulateKernel.h b/src/core/CL/kernels/CLAccumulateKernel.h deleted file mode 100644 index 16a715319d..0000000000 --- a/src/core/CL/kernels/CLAccumulateKernel.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H -#define ARM_COMPUTE_CLACCUMULATEKERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -#include - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the accumulate kernel. - * - * Accumulation is computed by: - * @f[ accum(x,y) = accum(x,y) + input(x,y) @f] - */ -class CLAccumulateKernel : public ICLSimple2DKernel -{ -public: - /** Set the input and accumulation tensors. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] accum Destination tensor. Data types supported: S16. - */ - void configure(const ICLTensor *input, ICLTensor *accum); - /** Set the input and accumulation tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] accum Destination tensor. Data types supported: S16. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum); -}; - -/** Interface for the accumulate weighted kernel. - * - * Weighted accumulation is computed: - * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f] - * - * Where @f$ 0 \le \alpha \le 1 @f$ - * Conceptually, the rounding for this is defined as: - * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f] -*/ -class CLAccumulateWeightedKernel : public ICLSimple2DKernel -{ -public: - /** Set the input and accumulation images, and the scale value. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32. - * @param[in,out] accum Accumulated tensor. Data types supported: U8. - */ - void configure(const ICLTensor *input, float alpha, ICLTensor *accum); - /** Set the input and accumulation images, and the scale value. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32. - * @param[in,out] accum Accumulated tensor. Data types supported: U8. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum); -}; - -/** Interface for the accumulate squared kernel. - * - * The accumulation of squares is computed: - * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f] - * - * Where @f$ 0 \le shift \le 15 @f$ -*/ -class CLAccumulateSquaredKernel : public ICLSimple2DKernel -{ -public: - /** Set the input and accumulation tensors and the shift value. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[in] shift Shift value in the range of [0, 15]. Data types supported: U32. - * @param[in,out] accum Accumulated tensor. Data types supported: S16. - */ - void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum); - /** Set the input and accumulation tensors and the shift value. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[in] shift Shift value in the range of [0, 15]. Data types supported: U32. - * @param[in,out] accum Accumulated tensor. Data types supported: S16. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum); -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */ diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp deleted file mode 100644 index 9f493b4fb8..0000000000 --- a/src/core/CL/kernels/CLBox3x3Kernel.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLBox3x3Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -BorderSize CLBox3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLBox3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - _input = input; - _output = output; - - // Set build options - std::set build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1", - "-DMAT3=1", "-DMAT4=1", "-DMAT5=1", - "-DMAT6=1", "-DMAT7=1", "-DMAT8=1", - "-DSCALE=9", "-DDATA_TYPE_OUT=uchar" - }; - - // Create kernel - _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLBox3x3Kernel.h b/src/core/CL/kernels/CLBox3x3Kernel.h deleted file mode 100644 index 2373c4a928..0000000000 --- a/src/core/CL/kernels/CLBox3x3Kernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H -#define ARM_COMPUTE_CLBOX3X3KERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the box 3x3 filter kernel. - * - */ -class CLBox3x3Kernel : public ICLSimple2DKernel -{ -public: - /**Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /**Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - //Inherited methods overriden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */ diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp deleted file mode 100644 index 1fe944c8a2..0000000000 --- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLCannyEdgeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -using namespace arm_compute; - -CLGradientKernel::CLGradientKernel() - : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) -{ -} - -void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, norm_type); -} - -void CLGradientKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()), - "Gx and Gy must have the same pixel size"); - ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()), - "Mag must have the same pixel size as Gx and Gy"); - - _gx = gx; - _gy = gy; - _magnitude = magnitude; - _phase = phase; - - // Create build opts - std::set built_opts; - built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type())); - built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type())); - - // Create kernel - const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2"); - _kernel = create_kernel(compile_context, kernel_name, built_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 4; - - Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access); - - mag_access.set_valid_region(win, _gx->info()->valid_region()); - phase_access.set_valid_region(win, _gx->info()->valid_region()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(gx->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(gx->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gx->info()->dimension(1)); -} - -void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _gx, slice); - add_2D_tensor_argument(idx, _gy, slice); - add_2D_tensor_argument(idx, _magnitude, slice); - add_2D_tensor_argument(idx, _phase, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel() - : _magnitude(nullptr), _phase(nullptr), _output(nullptr) -{ -} - -BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const -{ - return BorderSize(1); -} - -void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), magnitude, phase, output, lower_thr, border_undefined); -} - -void CLEdgeNonMaxSuppressionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32); - - _magnitude = magnitude; - _phase = phase; - _output = output; - - // Create build opts - std::set built_opts; - built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type())); - built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - const std::string kernel_name = std::string("suppress_non_maximum"); - _kernel = create_kernel(compile_context, kernel_name, built_opts); - - // Set minimum threshold argument - unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, lower_thr); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 1; - constexpr unsigned int num_elems_read_written_per_iteration = 3; - - Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, - num_elems_read_written_per_iteration, num_elems_read_written_per_iteration); - AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, mag_access, phase_access, output_access); - - output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(output->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _magnitude, slice); - add_2D_tensor_argument(idx, _phase, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLEdgeTraceKernel::CLEdgeTraceKernel() - : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr) -{ -} - -void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, - ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, visited, recorded, l1_stack, l1_stack_counter); -} - -void CLEdgeTraceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, - ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8); - - _input = input; - _output = output; - _lower_thr = lower_thr; - _upper_thr = upper_thr; - _visited = visited; - _recorded = recorded; - _l1_stack = l1_stack; - _l1_stack_counter = l1_stack_counter; - - // Create build opts - std::set built_opts; - built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); - built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - const std::string kernel_name = std::string("hysteresis"); - _kernel = create_kernel(compile_context, kernel_name, built_opts); - - // Set constant kernel args - unsigned int width = _input->info()->dimension(0); - unsigned int height = _input->info()->dimension(1); - unsigned int idx = 6 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, static_cast(_lower_thr)); - _kernel.setArg(idx++, static_cast(_upper_thr)); - _kernel.setArg(idx++, static_cast(width)); - _kernel.setArg(idx++, static_cast(height)); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 1; - Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration), - output_access, - visited_access, - recorded_access, - l1_stack_access, - l1_stack_counter_access); - - output_access.set_valid_region(win, _input->info()->valid_region()); - visited_access.set_valid_region(win, _input->info()->valid_region()); - recorded_access.set_valid_region(win, _input->info()->valid_region()); - l1_stack_access.set_valid_region(win, _input->info()->valid_region()); - l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_format(output->info()->format())); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, slice); - add_2D_tensor_argument(idx, _visited, slice); - add_2D_tensor_argument(idx, _recorded, slice); - add_2D_tensor_argument(idx, _l1_stack, slice); - add_2D_tensor_argument(idx, _l1_stack_counter, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.h b/src/core/CL/kernels/CLCannyEdgeKernel.h deleted file mode 100644 index 7543822d8d..0000000000 --- a/src/core/CL/kernels/CLCannyEdgeKernel.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H -#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform Gradient computation. - */ -class CLGradientKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGradientKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGradientKernel(const CLGradientKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGradientKernel &operator=(const CLGradientKernel &) = delete; - /** Initialise the kernel's sources, destinations and border mode. - * - * @note gx, gy and mag must all be the same size (either 16 or 32). - * - * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32. - * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx. - * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy. - * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8. - * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm. - */ - void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type); - /** Initialise the kernel's sources, destinations and border mode. - * - * @note gx, gy and mag must all be the same size (either 16 or 32). - * - * @param[in] compile_context The compile context to be used. - * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32. - * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx. - * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy. - * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8. - * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_gx; /**< Source tensor - Gx component */ - const ICLTensor *_gy; /**< Source tensor - Gy component */ - ICLTensor *_magnitude; /**< Destination tensor - Magnitude */ - ICLTensor *_phase; /**< Destination tensor - Quantized phase */ -}; - -/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge. - * - * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input - * to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed. - * - * @note Hysteresis is computed in @ref CLEdgeTraceKernel - */ -class CLEdgeNonMaxSuppressionKernel : public ICLKernel -{ -public: - /** Constructor */ - CLEdgeNonMaxSuppressionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete; - /** Initialise the kernel's sources, destination and border mode. - * - * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32. - * @param[in] phase Source tensor - Quantized phase. Data types supported: U8. - * @param[out] output Destination tensor. Data types supported: U16/U32. - * @param[in] lower_thr Lower threshold. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined); - /** Initialise the kernel's sources, destination and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32. - * @param[in] phase Source tensor - Quantized phase. Data types supported: U8. - * @param[out] output Destination tensor. Data types supported: U16/U32. - * @param[in] lower_thr Lower threshold. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */ - const ICLTensor *_phase; /**< Source tensor - Quantized phase. */ - ICLTensor *_output; /**< Destination tensor. */ -}; - -/** OpenCL kernel to perform Edge tracing. - */ -class CLEdgeTraceKernel : public ICLKernel -{ -public: - /** Constructor */ - CLEdgeTraceKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete; - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] input Source tensor. Data types supported: U16/U32. - * @param[out] output Destination tensor. Data types supported: U8. - * @param[in] upper_thr Upper threshold used for the hysteresis - * @param[in] lower_thr Lower threshold used for the hysteresis - * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32. - * Expected to be initialized to 0 before each run. - * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32 - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32. - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8. - * Expected to be initialized to 0 before each run. - */ - void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, - ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter); - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U16/U32. - * @param[out] output Destination tensor. Data types supported: U8. - * @param[in] upper_thr Upper threshold used for the hysteresis - * @param[in] lower_thr Lower threshold used for the hysteresis - * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32. - * Expected to be initialized to 0 before each run. - * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32 - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32. - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8. - * Expected to be initialized to 0 before each run. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, - ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Source tensor. */ - ICLTensor *_output; /**< Destination tensor. */ - int32_t _lower_thr; /**< Lower threshold used for the hysteresis. */ - int32_t _upper_thr; /**< Upper threshold used for the hysteresis. */ - ICLTensor *_visited; /**< Marks visited elements */ - ICLTensor *_recorded; /**< Marks recorded elements */ - ICLTensor *_l1_stack; /**< L1 hysteris stack */ - ICLTensor *_l1_stack_counter; /**< L1 hysteris stack counter */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */ diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp deleted file mode 100644 index 52ba9dd065..0000000000 --- a/src/core/CL/kernels/CLChannelCombineKernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLChannelCombineKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLMultiImage.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -namespace arm_compute -{ -namespace -{ -constexpr unsigned int num_elems_processed_per_iteration = 16; -} // namespace - -CLChannelCombineKernel::CLChannelCombineKernel() - : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } } -{ -} - -void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output); -} - -void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8); - - const Format output_format = output->info()->format(); - - // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes - if(Format::YUYV422 == output_format || Format::UYVY422 == output_format) - { - // Validate Y plane of input and output - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output); - - // Validate U and V plane of the input - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2); - } - - _planes[0] = plane0; - _planes[1] = plane1; - _planes[2] = plane2; - _planes[3] = nullptr; - - // Validate the last input tensor only for RGBA format - if(Format::RGBA8888 == output_format) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(plane3); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3); - - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8); - - _planes[3] = plane3; - } - - _output = output; - _output_multi = nullptr; - - // Half the processed elements for U and V channels due to horizontal sub-sampling of 2 - if(Format::YUYV422 == output_format || Format::UYVY422 == output_format) - { - _x_subsampling[1] = 2; - _x_subsampling[2] = 2; - } - - // Create kernel - std::string kernel_name = "channel_combine_" + string_from_format(output_format); - _kernel = create_kernel(compile_context, kernel_name); - - // Configure window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); - AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); - AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access); - - ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(), - plane1->info()->valid_region(), - plane2->info()->valid_region()); - if(plane3 != nullptr) - { - valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region); - } - output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output); -} - -void CLChannelCombineKernel::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2); - - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8); - - const Format output_format = output->info()->format(); - - // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes - // Perform validation only for formats which require sub-sampling. - if(Format::YUV444 != output_format) - { - // Validate Y plane of input and output - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0)); - - // Validate U and V plane of the input - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2); - - // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane) - // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1)); - - // Validate the last plane V of format IYUV - if(Format::IYUV == output_format) - { - // Validate Y plane of the output - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2)); - } - } - - // Set input tensors - _planes[0] = plane0; - _planes[1] = plane1; - _planes[2] = plane2; - _planes[3] = nullptr; - - // Set output tensor - _output = nullptr; - _output_multi = output; - - bool has_two_planars = false; - - // Set sub-sampling parameters for each plane - std::string kernel_name; - std::set build_opts; - - if(Format::NV12 == output_format || Format::NV21 == output_format) - { - _x_subsampling = { { 1, 2, 2 } }; - _y_subsampling = { { 1, 2, 2 } }; - kernel_name = "channel_combine_NV"; - build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21"); - has_two_planars = true; - } - else - { - if(Format::IYUV == output_format) - { - _x_subsampling = { { 1, 2, 2 } }; - _y_subsampling = { { 1, 2, 2 } }; - } - - kernel_name = "copy_planes_3p"; - build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444"); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure window - Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f); - AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); - AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); - AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]); - AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); - AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); - - update_window_and_padding(win, - input_plane0_access, input_plane1_access, input_plane2_access, - output_plane0_access, output_plane1_access, output_plane2_access); - - ValidRegion plane0_valid_region = plane0->info()->valid_region(); - ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region(); - output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape())); - output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape())); - output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - slice.set_dimension_step(Window::DimY, 1); - - do - { - // Subsampling in plane 1 - Window win_sub_plane1(slice); - win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1])); - win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1)); - - // Subsampling in plane 2 - Window win_sub_plane2(slice); - win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2])); - win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1)); - - unsigned int idx = 0; - - // Set inputs - add_2D_tensor_argument(idx, _planes[0], slice); - add_2D_tensor_argument(idx, _planes[1], win_sub_plane1); - add_2D_tensor_argument(idx, _planes[2], win_sub_plane2); - add_2D_tensor_argument_if((nullptr != _planes[3]), idx, _planes[3], slice); - - // Set outputs - if(nullptr != _output) // Single planar output - { - add_2D_tensor_argument(idx, _output, slice); - } - else // Multi-planar output - { - // Reduce slice in case of subsampling to avoid out-of bounds access - slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1)); - - add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice); - add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1); - add_2D_tensor_argument_if((3 == num_planes_from_format(_output_multi->info()->format())), idx, _output_multi->cl_plane(2), win_sub_plane2); - - _kernel.setArg(idx++, slice.y().end()); - } - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLChannelCombineKernel.h b/src/core/CL/kernels/CLChannelCombineKernel.h deleted file mode 100644 index f19995aa8e..0000000000 --- a/src/core/CL/kernels/CLChannelCombineKernel.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H -#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -#include -#include - -namespace arm_compute -{ -class ICLMultiImage; -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the channel combine kernel */ -class CLChannelCombineKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLChannelCombineKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLChannelCombineKernel(const CLChannelCombineKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete; - /** Allow instances of this class to be moved */ - CLChannelCombineKernel(CLChannelCombineKernel &&) = default; - /** Allow instances of this class to be moved */ - CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default; - /** Default destructor */ - ~CLChannelCombineKernel() = default; - /** Configure function's inputs and outputs. - * - * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. - * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. - * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. - * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format. - * @param[out] output The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422. - */ - void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); - /** Configure function's inputs and outputs. - * - * @param[in] compile_context The compile context to be used. - * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. - * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. - * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. - * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format. - * @param[out] output The single planar output tensor. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); - /** Configure function's inputs and outputs. - * - * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. - * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. - * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. - * @param[out] output The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422. - */ - void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); - /** Configure function's inputs and outputs. - * - * @param[in] compile_context The compile context to be used. - * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. - * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. - * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. - * @param[out] output The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - std::array _planes; - ICLTensor *_output; - ICLMultiImage *_output_multi; - std::array _x_subsampling; - std::array _y_subsampling; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */ diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp deleted file mode 100644 index cbf504b98b..0000000000 --- a/src/core/CL/kernels/CLChannelExtractKernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLChannelExtractKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLMultiImage.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -CLChannelExtractKernel::CLChannelExtractKernel() - : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1) -{ -} - -void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON(input == output); - - set_format_if_unknown(*output->info(), Format::U8); - - // Check if input tensor has a valid format - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - - // Check if channel is valid for given format - const Format format = input->info()->format(); - ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel); - - // Half the processed elements for U,V channels due to sub-sampling of 2 - _subsampling = 1; - - if(format == Format::YUYV422 || format == Format::UYVY422) - { - // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422) - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input); - - if(channel != Channel::Y) - { - _subsampling = 2; - } - } - - // Calculate output tensor shape using subsampling - TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel); - set_shape_if_empty(*output->info(), output_shape); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - - _input = input; - _output = output; - - // Create kernel - std::string kernel_name = "channel_extract_" + string_from_format(format); - std::set build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) }; - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure window - Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration); - AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling); - - update_window_and_padding(win, input_access, output_access); - - ValidRegion input_valid_region = input->info()->valid_region(); - output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} - -void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtractKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - - set_format_if_unknown(*output->info(), Format::U8); - - // Check if channel is valid for given format - const Format format = input->info()->format(); - ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel); - - // Get input plane from the given channel - const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(format, channel)); - ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane); - - if(Channel::Y == channel && format != Format::YUV444) - { - // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422) - ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane); - } - - // Calculate 2x2 subsampled tensor shape - TensorShape output_shape = calculate_subsampled_shape(input->cl_plane(0)->info()->tensor_shape(), format, channel); - set_shape_if_empty(*output->info(), output_shape); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape()); - - // Check if input tensor has a valid format - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); - ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); - - _output = output; - _input = input_plane; - _subsampling = 1; - - // Create kernel - std::string kernel_name; - std::set build_opts; - if(Channel::Y == channel || Format::IYUV == format || Format::YUV444 == format) - { - kernel_name = "copy_plane"; - } - else - { - kernel_name = "channel_extract_" + string_from_format(format); - build_opts.insert(("-DCHANNEL_" + string_from_channel(channel))); - } - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure window - Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input_plane->info(), 0, _num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input_plane->info()->valid_region()); - - ICLKernel::configure_internal(win); -} - -void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - do - { - Window win_sub(slice); - win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling)); - win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1)); - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, win_sub); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLChannelExtractKernel.h b/src/core/CL/kernels/CLChannelExtractKernel.h deleted file mode 100644 index 37abde548c..0000000000 --- a/src/core/CL/kernels/CLChannelExtractKernel.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H -#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -#include - -namespace arm_compute -{ -class ICLMultiImage; -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the channel extract kernel */ -class CLChannelExtractKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLChannelExtractKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLChannelExtractKernel(const CLChannelExtractKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete; - /** Allow instances of this class to be moved */ - CLChannelExtractKernel(CLChannelExtractKernel &&) = default; - /** Allow instances of this class to be moved */ - CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default; - /** Default destructor */ - ~CLChannelExtractKernel() = default; - /** Set the input and output of the kernel - * - * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 - * @param[in] channel Channel to extract. - * @param[out] output Destination tensor. Must be of U8 format. - */ - void configure(const ICLTensor *input, Channel channel, ICLTensor *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 - * @param[in] channel Channel to extract. - * @param[out] output Destination tensor. Must be of U8 format. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output); - /** Set the input and output of the kernel - * - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444 - * @param[in] channel Channel to extract. - * @param[out] output Single-planar 2D destination image. Must be of U8 format. - */ - void configure(const ICLMultiImage *input, Channel channel, ICLImage *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444 - * @param[in] channel Channel to extract. - * @param[out] output Single-planar 2D destination image. Must be of U8 format. - */ - void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - uint32_t _num_elems_processed_per_iteration; - uint32_t _subsampling; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */ diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp deleted file mode 100644 index 6c61fec997..0000000000 --- a/src/core/CL/kernels/CLColorConvertKernel.cpp +++ /dev/null @@ -1,558 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLColorConvertKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLMultiImage.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include - -using namespace arm_compute; - -CLColorConvertKernel::CLColorConvertKernel() - : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr) -{ -} - -void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON(input == nullptr); - ARM_COMPUTE_ERROR_ON(output == nullptr); - - unsigned int num_elems_processed_per_iteration = 0; - switch(input->info()->format()) - { - case Format::RGBA8888: - { - switch(output->info()->format()) - { - case Format::RGB888: - num_elems_processed_per_iteration = 16; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - case Format::UYVY422: - case Format::YUYV422: - { - switch(output->info()->format()) - { - case Format::RGB888: - case Format::RGBA8888: - num_elems_processed_per_iteration = 8; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - case Format::RGB888: - { - switch(output->info()->format()) - { - case Format::RGBA8888: - case Format::U8: - num_elems_processed_per_iteration = 16; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - default: - break; - } - ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", - string_from_format(input->info()->format()).c_str(), - string_from_format(output->info()->format()).c_str()); - - std::stringstream kernel_name; - - kernel_name << string_from_format(input->info()->format()); - kernel_name << "_to_"; - kernel_name << string_from_format(output->info()->format()); - kernel_name << "_bt709"; - - _input = input; - _output = output; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name.str()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - ARM_COMPUTE_ERROR_ON(output == nullptr); - - unsigned int num_elems_processed_per_iteration = 0; - - switch(input->info()->format()) - { - case Format::NV12: - case Format::NV21: - case Format::IYUV: - { - switch(output->info()->format()) - { - case Format::RGB888: - case Format::RGBA8888: - num_elems_processed_per_iteration = 4; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - default: - break; - } - ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", - string_from_format(input->info()->format()).c_str(), - string_from_format(output->info()->format()).c_str()); - - std::stringstream kernel_name; - - kernel_name << string_from_format(input->info()->format()); - kernel_name << "_to_"; - kernel_name << string_from_format(output->info()->format()); - kernel_name << "_bt709"; - - _multi_input = input; - _output = output; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name.str()); - - // Configure kernel window - const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21); - const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - win.set_dimension_step(Window::DimY, 2); - - AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, - sub_sampling, sub_sampling); - AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, - sub_sampling, sub_sampling); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - plane0_access, plane1_access, plane2_access, - output_access); - - ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), - input->plane(2)->info()->valid_region()); - output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1)); -} - -void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(output == nullptr); - - unsigned int num_elems_processed_per_iteration = 0; - unsigned int num_elems_read_per_iteration_x = 0; - - bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); - float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1; - - switch(input->info()->format()) - { - case Format::RGB888: - case Format::RGBA8888: - { - switch(output->info()->format()) - { - case Format::NV12: - case Format::IYUV: - num_elems_processed_per_iteration = 2; - num_elems_read_per_iteration_x = 8; - break; - case Format::YUV444: - num_elems_processed_per_iteration = 4; - num_elems_read_per_iteration_x = 16; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - case Format::UYVY422: - case Format::YUYV422: - { - switch(output->info()->format()) - { - case Format::NV12: - case Format::IYUV: - num_elems_processed_per_iteration = 8; - num_elems_read_per_iteration_x = 8; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - default: - break; - } - - ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", - string_from_format(input->info()->format()).c_str(), - string_from_format(output->info()->format()).c_str()); - - std::stringstream kernel_name; - - kernel_name << string_from_format(input->info()->format()); - kernel_name << "_to_"; - kernel_name << string_from_format(output->info()->format()); - kernel_name << "_bt709"; - _input = input; - _multi_output = output; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name.str()); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444)) - { - win.set_dimension_step(Window::DimY, 2); - } - - AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); - AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, - num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); - - AccessWindowHorizontal input_access(input->info(), 0, num_elems_read_per_iteration_x); - - update_window_and_padding(win, - input_access, - output_plane0_access, - output_plane1_access, - output_plane2_access); - - ValidRegion input_region = input->info()->valid_region(); - - output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape())); - output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape())); - output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvertKernel::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output) -{ - unsigned int num_elems_processed_per_iteration = 0; - switch(input->info()->format()) - { - case Format::NV12: - case Format::NV21: - { - switch(output->info()->format()) - { - case Format::IYUV: - case Format::YUV444: - num_elems_processed_per_iteration = 16; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - case Format::IYUV: - { - switch(output->info()->format()) - { - case Format::YUV444: - case Format::NV12: - num_elems_processed_per_iteration = 16; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - break; - } - default: - break; - } - ARM_COMPUTE_ERROR_ON_MSG_VAR(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported", - string_from_format(input->info()->format()).c_str(), - string_from_format(output->info()->format()).c_str()); - - std::stringstream kernel_name; - - kernel_name << string_from_format(input->info()->format()); - kernel_name << "_to_"; - kernel_name << string_from_format(output->info()->format()); - kernel_name << "_bt709"; - - _multi_input = input; - _multi_output = output; - - // Create kernel - bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21); - bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21); - - float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1; - float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1; - - _kernel = create_kernel(compile_context, kernel_name.str()); - - Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration)); - win.set_dimension_step(Window::DimY, 2); - - AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, - sub_sampling_input, sub_sampling_input); - AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, - sub_sampling_input, sub_sampling_input); - AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); - AccessWindowRectangle output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0, - num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output); - - update_window_and_padding(win, - input_plane0_access, input_plane1_access, input_plane2_access, - output_plane0_access, output_plane1_access, output_plane2_access); - - ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(), - input->plane(2)->info()->valid_region()); - output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape())); - output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape())); - output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->plane(0)->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(0)->info()->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->plane(1)->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->plane(1)->info()->dimension(1)); -} - -void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - if(nullptr != _input && nullptr != _output) - { - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - } - else if(nullptr != _input && nullptr != _multi_output) - { - Format format = _multi_output->info()->format(); - do - { - Window win_uv(slice); - - if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) - { - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - } - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice); - for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) - { - add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - } - else if(nullptr != _multi_input && nullptr != _output) - { - Format format = _multi_input->info()->format(); - do - { - Window win_uv(slice); - - if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format)) - { - win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); - win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); - - for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) - { - add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv); - } - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - } - else if(nullptr != _multi_input && nullptr != _multi_output) - { - Format in_format = _multi_input->info()->format(); - Format out_format = _multi_output->info()->format(); - do - { - Window win_in_uv(slice); - if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format)) - { - win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2, - win_in_uv.x().end() / 2, win_in_uv.x().step() / 2)); - win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1)); - } - unsigned int idx = 0; - add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice); - for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i) - { - add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv); - } - - Window win_out_uv(slice); - if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format)) - { - win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2, - win_out_uv.x().end() / 2, win_out_uv.x().step() / 2)); - win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1)); - } - - add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice); - for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i) - { - add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} diff --git a/src/core/CL/kernels/CLColorConvertKernel.h b/src/core/CL/kernels/CLColorConvertKernel.h deleted file mode 100644 index 0f082914cd..0000000000 --- a/src/core/CL/kernels/CLColorConvertKernel.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H -#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLMultiImage; -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the color convert kernel. - * - */ -class CLColorConvertKernel : public ICLKernel -{ -public: - /** Default constructor. */ - CLColorConvertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLColorConvertKernel(const CLColorConvertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete; - /** Allow instances of this class to be moved */ - CLColorConvertKernel(CLColorConvertKernel &&) = default; - /** Allow instances of this class to be moved */ - CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default; - /** Default destructor. */ - ~CLColorConvertKernel() = default; - - /** Set the input and output of the kernel - * - * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 - * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), - * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/), - * U8 (if the formats of @p input is RGB888) - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 - * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), - * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/), - * U8 (if the formats of @p input is RGB888) - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); - /** Set the input and output of the kernel - * - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV - * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 - */ - void configure(const ICLMultiImage *input, ICLImage *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV - * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 - */ - void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output); - /** Set the input and output of the kernel - * - * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 - * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) - */ - void configure(const ICLImage *input, ICLMultiImage *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 - * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output); - /** Set the input and output of the kernel - * - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV - * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) - */ - void configure(const ICLMultiImage *input, ICLMultiImage *output); - /** Set the input and output of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV - * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) - */ - void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /*pointer to single planar tensor input */ - ICLTensor *_output; /*pointer to single planar tensor output */ - const ICLMultiImage *_multi_input; /*pointer to multi-planar input */ - ICLMultiImage *_multi_output; /*pointer to multi-planar output */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */ diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp deleted file mode 100644 index 21f1047cc6..0000000000 --- a/src/core/CL/kernels/CLConvolutionKernel.cpp +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLConvolutionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/ICLKernel.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -constexpr unsigned int max_matrix_size = 81; -} // namespace - -/****************************************************************************************\ - * Square Convolution * -\****************************************************************************************/ - -template -BorderSize CLConvolutionKernel::border_size() const -{ - return BorderSize(matrix_size / 2); -} - -template -void CLConvolutionKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined); -} - -template -void CLConvolutionKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON(conv == nullptr); - - _input = input; - _output = output; - - std::stringstream kernel_name; - CLBuildOptions build_opts; - kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static"; - - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - for(unsigned int i = 0; i < matrix_size * matrix_size; i++) - { - std::stringstream mat_str; - mat_str << "-DMAT" << i << "=" << conv[i]; - build_opts.add_option(mat_str.str()); - } - - build_opts.add_option("-DSCALE=" + support::cpp11::to_string(scale)); - - DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - - std::stringstream out_type; - out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); - build_opts.add_option(out_type.str()); - - _kernel = create_kernel(compile_context, kernel_name.str(), build_opts.options()); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = matrix_size; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} - -/****************************************************************************************\ - * Separable Convolution * -\****************************************************************************************/ -template -CLSeparableConvolutionHorKernel::CLSeparableConvolutionHorKernel() - : _border_size(0) -{ -} - -template -BorderSize CLSeparableConvolutionHorKernel::border_size() const -{ - return _border_size; -} - -template -void CLSeparableConvolutionHorKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, border_undefined); -} - -template -void CLSeparableConvolutionHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32); - - ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); - - _input = input; - _output = output; - _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2); - - // Set build options - std::set build_opts; - - std::array mat = { 0 }; - memcpy(mat.data(), conv, matrix_size * sizeof(int16_t)); - - for(unsigned int j = 0; j < matrix_size * matrix_size; j++) - { - build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j])); - } - - build_opts.insert("-DSCALE=0"); - - build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - - // Create kernel - const std::string kernel_name = "convolution_separable1x" + support::cpp11::to_string(matrix_size) + "_static"; - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -template -BorderSize CLSeparableConvolutionVertKernel::border_size() const -{ - return BorderSize{ matrix_size / 2, 0 }; -} - -template -void CLSeparableConvolutionVertKernel::configure(const ICLTensor *input, ICLTensor *output, - const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_undefined, data_type); -} - -template -void CLSeparableConvolutionVertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9)); - ARM_COMPUTE_ERROR_ON(scale == 0); - - _input = input; - _output = output; - - std::set build_opts; - - std::array mat = { 0 }; - memcpy(mat.data() + matrix_size, conv, matrix_size * sizeof(int16_t)); - - for(unsigned int j = 0; j < matrix_size * matrix_size; j++) - { - build_opts.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j])); - } - - build_opts.insert("-DSCALE=" + support::cpp11::to_string(scale)); - - build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - - build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type)); - - std::stringstream out_type; - out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); - build_opts.insert(out_type.str()); - - // Create kernel - const std::string kernel_name = "convolution_separable" + support::cpp11::to_string(matrix_size) + "x1_static"; - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = matrix_size; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -/****************************************************************************************\ - * Rectangle Convolution * -\****************************************************************************************/ - -CLConvolutionRectangleKernel::CLConvolutionRectangleKernel() - : _border_size(0), _input(nullptr), _output(nullptr) -{ -} - -BorderSize CLConvolutionRectangleKernel::border_size() const -{ - return _border_size; -} - -void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, width, height, scale, border_undefined); -} - -void CLConvolutionRectangleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, - bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON(nullptr == conv); - ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width); - ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height); - ARM_COMPUTE_ERROR_ON(0 == scale); - - _input = input; - _output = output; - _border_size = BorderSize(height / 2, width / 2); - - std::set options; - - std::stringstream output_type; - output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type()); - options.insert(output_type.str()); - - uint32_t matrix_size = width * height; - - std::array mat = { 0 }; - - memcpy(mat.data(), conv, matrix_size * sizeof(int16_t)); - - for(unsigned int j = 0; j < max_matrix_size; j++) - { - options.insert("-DMAT" + support::cpp11::to_string(j) + "=" + support::cpp11::to_string(mat[j])); - } - - options.insert("-DSCALE=" + support::cpp11::to_string(scale)); - - DataType data_type = data_type_for_convolution_matrix(conv, matrix_size); - options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - - options.insert("-DMATRIX_WIDTH=" + support::cpp11::to_string(width)); - options.insert("-DMATRIX_HEIGHT=" + support::cpp11::to_string(height)); - - _kernel = create_kernel(compile_context, "convolution_rectangle", options); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - const unsigned int num_rows_read_per_iteration = height; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} - -void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -template class arm_compute::CLConvolutionKernel<3>; -template class arm_compute::CLConvolutionKernel<5>; -template class arm_compute::CLConvolutionKernel<7>; -template class arm_compute::CLConvolutionKernel<9>; -template class arm_compute::CLSeparableConvolutionVertKernel<5>; -template class arm_compute::CLSeparableConvolutionVertKernel<7>; -template class arm_compute::CLSeparableConvolutionVertKernel<9>; -template class arm_compute::CLSeparableConvolutionHorKernel<5>; -template class arm_compute::CLSeparableConvolutionHorKernel<7>; -template class arm_compute::CLSeparableConvolutionHorKernel<9>; -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLConvolutionKernel.h b/src/core/CL/kernels/CLConvolutionKernel.h deleted file mode 100644 index 33e73caf11..0000000000 --- a/src/core/CL/kernels/CLConvolutionKernel.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H -#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -#include - -namespace arm_compute -{ -class ICLTensor; - -/****************************************************************************************\ - * Square Convolution * -\****************************************************************************************/ - -/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9). - * The client can supply a convolution matrix \f$ C_{m,n} \f$. - * @f{eqnarray}{ - * k_0 &=& \frac{m}{2} \\ - * l_0 &=& \frac{n}{2} \\ - * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l} - * @f} - * - * @note The above equation for this function is similar to the default OpenCV Filter2D function, - * which actually computes a correlation and not a convolution. - * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically. - */ -template -class CLConvolutionKernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; - -/** Interface for the kernel which applies a 3x3 convolution to a tensor. */ -using CLConvolution3x3Kernel = CLConvolutionKernel<3>; -/** Interface for the kernel which applies a 5x5 convolution to a tensor. */ -using CLConvolution5x5Kernel = CLConvolutionKernel<5>; -/** Interface for the kernel which applies a 7x7 convolution to a tensor. */ -using CLConvolution7x7Kernel = CLConvolutionKernel<7>; -/** Interface for the kernel which applies a 9x9 convolution to a tensor. */ -using CLConvolution9x9Kernel = CLConvolutionKernel<9>; - -/****************************************************************************************\ - * Separable Square Convolution * -\****************************************************************************************/ - -/** Kernel for the Horizontal pass of a Separable Convolution. Currently support 5x5, 7x7, 9x9 */ -template -class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel -{ -public: - /** Default Constructor */ - CLSeparableConvolutionHorKernel(); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U16/S16/S32. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; - -private: - BorderSize _border_size; /**< Border size */ -}; - -/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */ -using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>; -/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */ -using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>; -/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */ -using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>; - -/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */ -template -class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data types supported: U16/S16/S32. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - * @param[in] data_type Data type to use for intermeidate result. @sa data_type_for_convolution - */ - void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U16/S16/S32. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - * @param[in] data_type Data type to use for intermeidate result. @sa data_type_for_convolution - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; - -/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */ -using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>; -/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */ -using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>; -/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */ -using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>; - -/****************************************************************************************\ - * Rectangle Convolution * -\****************************************************************************************/ - -/** Kernel for the running convolution on a rectangle matrix. - * - * @note Supports combinations of 3,5,7 and 9. - */ -class CLConvolutionRectangleKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLConvolutionRectangleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete; - /** Allow instances of this class to be moved */ - CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default; - /** Allow instances of this class to be moved */ - CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] width Width of convolution matrix (Number of columns) - * @param[in] height Height of convolution matrix (Number of rows) - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] width Width of convolution matrix (Number of columns) - * @param[in] height Height of convolution matrix (Number of rows) - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - BorderSize _border_size; - const ICLTensor *_input; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp deleted file mode 100644 index 5ff11362cc..0000000000 --- a/src/core/CL/kernels/CLDerivativeKernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLDerivativeKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLDerivativeKernel::CLDerivativeKernel() - : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false) -{ -} - -BorderSize CLDerivativeKernel::border_size() const -{ - return BorderSize(1); -} - -void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined); -} - -void CLDerivativeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_derivative_x = output_x != nullptr; - _run_derivative_y = output_y != nullptr; - - if(_run_derivative_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); - } - - if(_run_derivative_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); - } - - _input = input; - _output_x = output_x; - _output_y = output_y; - - // Set build options - std::set build_opts; - - if(_run_derivative_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_derivative_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - const std::string kernel_name = std::string("derivative"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - constexpr unsigned int num_read_rows_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration); - if(_run_derivative_x && _run_derivative_y) - { - // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2' - input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration); - } - else if(_run_derivative_x) - { - // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2' - input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2); - } - else if(_run_derivative_y) - { - input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration); - } - - update_window_and_padding(win, - input_access, - output_x_access, - output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument_if((_run_derivative_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_derivative_y), idx, _output_y, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLDerivativeKernel.h b/src/core/CL/kernels/CLDerivativeKernel.h deleted file mode 100644 index 14dd05d084..0000000000 --- a/src/core/CL/kernels/CLDerivativeKernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H -#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the derivative kernel. */ -class CLDerivativeKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLDerivativeKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDerivativeKernel(const CLDerivativeKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete; - /** Allow instances of this class to be moved */ - CLDerivativeKernel(CLDerivativeKernel &&) = default; - /** Allow instances of this class to be moved */ - CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default; - /** Default destructor */ - ~CLDerivativeKernel() = default; - /** Initialise the kernel's sources, destination and border - * - * @note At least one of output_x or output_y must be set - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's sources, destination and border - * - * @note At least one of output_x or output_y must be set - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; /**< Input tensor */ - ICLTensor *_output_x; /**< Output tensor - Derivate along the X direction */ - ICLTensor *_output_y; /**< Output tensor - Derivate along the Y direction */ - bool _run_derivative_x; /**< Do we need to run Derivative X ? */ - bool _run_derivative_y; /**< Do we need to run Derivative Y ? */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */ diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp deleted file mode 100644 index cac5bc1c72..0000000000 --- a/src/core/CL/kernels/CLDilateKernel.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLDilateKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -using namespace arm_compute; - -BorderSize CLDilateKernel::border_size() const -{ - return BorderSize(1); -} - -void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLDilateKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - // Create kernel - _kernel = create_kernel(compile_context, "dilate"); - - _input = input; - _output = output; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLDilateKernel.h b/src/core/CL/kernels/CLDilateKernel.h deleted file mode 100644 index 591ec8ccfc..0000000000 --- a/src/core/CL/kernels/CLDilateKernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLDILATEKERNEL_H -#define ARM_COMPUTE_CLDILATEKERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the dilate kernel. - * - */ -class CLDilateKernel : public ICLSimple2DKernel -{ -public: - /**Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /**Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */ diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp deleted file mode 100644 index f6d98a5488..0000000000 --- a/src/core/CL/kernels/CLErodeKernel.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLErodeKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -using namespace arm_compute; - -BorderSize CLErodeKernel::border_size() const -{ - return BorderSize(1); -} - -void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLErodeKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - // Create kernel - _kernel = create_kernel(compile_context, "erode"); - - _input = input; - _output = output; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_rows_read_pes_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_pes_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLErodeKernel.h b/src/core/CL/kernels/CLErodeKernel.h deleted file mode 100644 index 4da97ae358..0000000000 --- a/src/core/CL/kernels/CLErodeKernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLERODEKERNEL_H -#define ARM_COMPUTE_CLERODEKERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the erode kernel. - * - */ -class CLErodeKernel : public ICLSimple2DKernel -{ -public: - /**Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /**Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLERODEKERNEL_H */ diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp deleted file mode 100644 index 7481fd1c27..0000000000 --- a/src/core/CL/kernels/CLFastCornersKernel.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLFastCornersKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLFastCornersKernel::CLFastCornersKernel() - : ICLKernel(), _input(nullptr), _output(nullptr) -{ -} - -BorderSize CLFastCornersKernel::border_size() const -{ - return BorderSize(3); -} - -void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, non_max_suppression, border_mode); -} - -void CLFastCornersKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented"); - - _input = input; - _output = output; - - // Create build options - std::set build_opts; - - if(non_max_suppression) - { - build_opts.emplace("-DUSE_MAXSUPPRESSION"); - } - - // Create kernel - const std::string kernel_name = std::string("fast_corners"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters - _kernel.setArg(idx, static_cast(threshold)); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 1; - constexpr unsigned int num_elems_read_per_iteration = 7; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3)); - - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(non_max_suppression); - _config_id += "_"; - _config_id += lower_string(string_from_border_mode(border_mode)); -} - -void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLCopyToArrayKernel::CLCopyToArrayKernel() - : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr) -{ -} - -void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, update_number, corners, num_buffers); -} - -void CLCopyToArrayKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(corners == nullptr); - ARM_COMPUTE_ERROR_ON(num_buffers == nullptr); - - _input = input; - _corners = corners; - _num_buffer = num_buffers; - - std::set build_opts; - - if(update_number) - { - build_opts.emplace("-DUPDATE_NUMBER"); - } - - // Create kernel - const std::string kernel_name = std::string("copy_to_keypoint"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - //Get how many pixels skipped in the x dimension in the previous stages - unsigned int offset = _input->info()->valid_region().anchor.x(); - - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters - _kernel.setArg(idx++, _corners->max_num_values()); - _kernel.setArg(idx++, offset); - _kernel.setArg(idx++, *_num_buffer); - _kernel.setArg(idx++, _corners->cl_buffer()); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 1; - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - //Initialise the _num_buffer as it used as both input and output - static const unsigned int zero_init = 0; - queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLFastCornersKernel.h b/src/core/CL/kernels/CLFastCornersKernel.h deleted file mode 100644 index 0c1b564c2f..0000000000 --- a/src/core/CL/kernels/CLFastCornersKernel.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H -#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H - -#include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -#include - -namespace cl -{ -class Buffer; -} - -namespace arm_compute -{ -class ICLTensor; -using ICLImage = ICLTensor; - -/** CL kernel to perform fast corners */ -class CLFastCornersKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLFastCornersKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLFastCornersKernel(const CLFastCornersKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete; - /** Allow instances of this class to be moved */ - CLFastCornersKernel(CLFastCornersKernel &&) = default; - /** Allow instances of this class to be moved */ - CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default; - /** Default destructor */ - ~CLFastCornersKernel() = default; - - /** Initialise the kernel. - * - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Output image. Data types supported: U8. - * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3. - * @param[in] non_max_suppression True if non-maxima suppresion is applied, false otherwise. - * @param[in] border_mode Strategy to use for borders. - */ - void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode); - /** Initialise the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Output image. Data types supported: U8. - * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3. - * @param[in] non_max_suppression True if non-maxima suppresion is applied, false otherwise. - * @param[in] border_mode Strategy to use for borders. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode); - - // Inherited methods overridden - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLImage *_input; - ICLImage *_output; -}; - -/** CL kernel to copy keypoints information to ICLKeyPointArray and counts the number of key points */ -class CLCopyToArrayKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLCopyToArrayKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete; - /** Allow instances of this class to be moved */ - CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default; - /** Allow instances of this class to be moved */ - CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default; - /** Default destructor */ - ~CLCopyToArrayKernel() = default; - - /** Initialise the kernel. - * - * @param[in] input Source image. Data types supported: U8. - * @param[in] update_number Flag to indicate whether we need to update the number of corners - * @param[out] corners Array of keypoints to store the results. - * @param[out] num_buffers Number of keypoints to store the results. - */ - void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers); - /** Initialise the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source image. Data types supported: U8. - * @param[in] update_number Flag to indicate whether we need to update the number of corners - * @param[out] corners Array of keypoints to store the results. - * @param[out] num_buffers Number of keypoints to store the results. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLImage *_input; /**< source image */ - ICLKeyPointArray *_corners; /**< destination array */ - cl::Buffer *_num_buffer; /**< CL memory to record number of key points in the array */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */ diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp deleted file mode 100644 index 40e9658ab4..0000000000 --- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -BorderSize CLGaussian3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLGaussian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - _input = input; - _output = output; - - // Set build options - std::set build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1", - "-DMAT3=2", "-DMAT4=4", "-DMAT5=2", - "-DMAT6=1", "-DMAT7=2", "-DMAT8=1", - "-DSCALE=16", "-DDATA_TYPE_OUT=uchar" - }; - - // Create kernel - _kernel = create_kernel(compile_context, "convolution3x3_static", build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.h b/src/core/CL/kernels/CLGaussian3x3Kernel.h deleted file mode 100644 index 139b05d44c..0000000000 --- a/src/core/CL/kernels/CLGaussian3x3Kernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H -#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the Gaussian 3x3 filter kernel. - * - */ -class CLGaussian3x3Kernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */ diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp deleted file mode 100644 index 46a7576154..0000000000 --- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" - -#include - -using namespace arm_compute; - -void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLGaussian5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - const std::array matrix = { 1, 4, 6, 4, 1 }; - - // Set arguments - CLSeparableConvolution5x5HorKernel::configure(compile_context, input, output, matrix.data(), border_undefined); -} - -void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLGaussian5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - const uint32_t scale = 256; - const std::array matrix = { 1, 4, 6, 4, 1 }; - - // Set arguments - CLSeparableConvolution5x5VertKernel::configure(compile_context, input, output, matrix.data(), scale, border_undefined); -} diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.h b/src/core/CL/kernels/CLGaussian5x5Kernel.h deleted file mode 100644 index 711710b3b3..0000000000 --- a/src/core/CL/kernels/CLGaussian5x5Kernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H -#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H - -#include "src/core/CL/kernels/CLConvolutionKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */ -class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel -{ -public: - /** Initialise the kernel's source, destination and border. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor. Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor. Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - -private: - //Make the configure method of the parent class private - using CLSeparableConvolution5x5HorKernel::configure; -}; - -/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */ -class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel -{ -public: - /** Initialise the kernel's source, destination and border. - * - * @param[in] input Input tensor(output of horizontal pass). Data types supported: S16. - * @param[out] output Destination tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor(output of horizontal pass). Data types supported: S16. - * @param[out] output Destination tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - -private: - //Make the configure method of the parent class private - using CLSeparableConvolution5x5VertKernel::configure; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */ diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp deleted file mode 100644 index 065f7f7e92..0000000000 --- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -using namespace arm_compute; - -CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel() - : _l2_load_offset(0) -{ -} - -BorderSize CLGaussianPyramidHorKernel::border_size() const -{ - return BorderSize{ 0, 2 }; -} - -void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLGaussianPyramidHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); - - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); - } - - _input = input; - _output = output; - - // Create kernel - const std::string kernel_name = std::string("gaussian1x5_sub_x"); - _kernel = create_kernel(compile_context, kernel_name); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - constexpr unsigned int num_elems_read_per_iteration = 20; - constexpr unsigned int num_elems_written_per_iteration = 8; - const float scale_x = static_cast(output->info()->dimension(0)) / input->info()->dimension(0); - - Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x); - - // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even - // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether - // a pixel is even or odd is determined based on the tensor shape not the - // valid region!) - // Thus the offset from which the first pixel (L2) for the convolution is - // loaded depends on the anchor and shape of the valid region. - // In the case of an even shape (= even image width) we need to load L2 - // from -2 if the anchor is odd and from -1 if the anchor is even. That - // makes sure that L2 is always loaded from an odd pixel. - // On the other hand, for an odd shape (= odd image width) we need to load - // L2 from -1 if the anchor is odd and from -2 if the anchor is even to - // achieve the opposite effect. - // The condition can be simplified to checking whether anchor + shape is - // odd (-2) or even (-1) as only adding an odd and an even number will have - // an odd result. - _l2_load_offset = -border_size().left; - - if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0) - { - _l2_load_offset += 1; - } - - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration), - output_access); - - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window win_in(window); - win_in.shift(Window::DimX, _l2_load_offset); - - //The output is half the width of the input: - Window win_out(window); - win_out.scale(Window::DimX, 0.5f); - - Window slice_in = win_in.first_slice_window_2D(); - Window slice_out = win_out.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); -} - -CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel() - : _t2_load_offset(0) -{ -} - -BorderSize CLGaussianPyramidVertKernel::border_size() const -{ - return BorderSize{ 2, 0 }; -} - -void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLGaussianPyramidVertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); - - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); - } - - _input = input; - _output = output; - - // Create kernel - const std::string kernel_name = std::string("gaussian5x1_sub_y"); - _kernel = create_kernel(compile_context, "gaussian5x1_sub_y"); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_rows_processed_per_iteration = 2; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 8; - constexpr unsigned int num_rows_per_iteration = 5; - - const float scale_y = static_cast(output->info()->dimension(1)) / input->info()->dimension(1); - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration)); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y); - - // Determine whether we need to load even or odd rows. See above for a - // detailed explanation. - _t2_load_offset = -border_size().top; - - if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0) - { - _t2_load_offset += 1; - } - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration), - output_access); - - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(window.x().step() != 8); - ARM_COMPUTE_ERROR_ON(window.y().step() % 2); - - Window win_in(window); - win_in.shift(Window::DimY, _t2_load_offset); - - Window win_out(window); - win_out.scale(Window::DimY, 0.5f); - - Window slice_in = win_in.first_slice_window_2D(); - Window slice_out = win_out.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out)); -} diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.h b/src/core/CL/kernels/CLGaussianPyramidKernel.h deleted file mode 100644 index a6595440f6..0000000000 --- a/src/core/CL/kernels/CLGaussianPyramidKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H -#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H - -#include "src/core/CL/ICLSimpleKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */ -class CLGaussianPyramidHorKernel : public ICLSimpleKernel -{ -public: - /** Default constructor */ - CLGaussianPyramidHorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default; - /** Default destructor */ - ~CLGaussianPyramidHorKernel() = default; - - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - int _l2_load_offset; -}; - -/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */ -class CLGaussianPyramidVertKernel : public ICLSimpleKernel -{ -public: - /** Default constructor */ - CLGaussianPyramidVertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default; - /** Default destructor */ - ~CLGaussianPyramidVertKernel() = default; - - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] input Source tensor. Data types supported: U16. - * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Initialise the kernel's source, destination and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U16. - * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - int _t2_load_offset; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */ diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp deleted file mode 100644 index cd3f1ee216..0000000000 --- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include - -using namespace arm_compute; - -CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel() - : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size() -{ -} - -void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input_magnitude, input_phase, output, hog_info); -} - -void CLHOGOrientationBinningKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(hog_info == nullptr); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32); - ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX)); - ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY)); - - _input_magnitude = input_magnitude; - _input_phase = input_phase; - _output = output; - _cell_size = hog_info->cell_size(); - - float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f); - phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f); - - std::stringstream args_str; - args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " "; - args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " "; - args_str << "-DNUM_BINS=" << hog_info->num_bins() << " "; - args_str << "-DPHASE_SCALE=" << phase_scale << " "; - - // Construct kernel name - std::set build_opts = {}; - build_opts.insert(args_str.str()); - - // Create kernel - const std::string kernel_name = std::string("hog_orientation_binning"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - constexpr unsigned int num_elems_read_per_iteration = 1; - const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height; - constexpr unsigned int num_elems_written_per_iteration = 1; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), - AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), - output_access); - - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input_magnitude->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input_magnitude->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - // Compute slice for the magnitude and phase tensors - Window slice_mag_phase = window.first_slice_window_2D(); - slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width)); - slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height)); - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase); - add_2D_tensor_argument(idx, _input_phase, slice_mag_phase); - add_2D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel() - : _input(nullptr), _output(nullptr), _num_cells_per_block_stride() -{ -} - -void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, hog_info); -} - -void CLHOGBlockNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info) -{ - ARM_COMPUTE_ERROR_ON(hog_info == nullptr); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); - - // Number of cells per block - const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width, - hog_info->block_size().height / hog_info->cell_size().height); - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32); - - // Number of cells per block stride - const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width, - hog_info->block_stride().height / hog_info->cell_size().height); - - _input = input; - _output = output; - _num_cells_per_block_stride = num_cells_per_block_stride; - - std::stringstream args_str; - args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " "; - args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " "; - args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " "; - args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " "; - args_str << "-DL2_NORM=" << static_cast(HOGNormType::L2_NORM) << " "; - args_str << "-DL1_NORM=" << static_cast(HOGNormType::L1_NORM) << " "; - args_str << "-DL2HYS_NORM=" << static_cast(HOGNormType::L2HYS_NORM) << " "; - args_str << "-DHOG_NORM_TYPE=" << static_cast(hog_info->normalization_type()) << " "; - - // Construct kernel name - std::set build_opts = {}; - build_opts.insert(args_str.str()); - - const std::string kernel_name = std::string("hog_block_normalization"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - constexpr unsigned int num_elems_read_per_iteration = 1; - const unsigned int num_rows_read_per_iteration = num_cells_per_block.height; - constexpr unsigned int num_elems_written_per_iteration = 1; - const unsigned int num_rows_written_per_iteration = num_cells_per_block.height; - - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), - output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - // Compute slice for the magnitude and phase tensors - Window slice_in = window.first_slice_window_2D(); - slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width); - slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height); - - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice_in); - add_2D_tensor_argument(idx, _output, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.h b/src/core/CL/kernels/CLHOGDescriptorKernel.h deleted file mode 100644 index eee2fa36bc..0000000000 --- a/src/core/CL/kernels/CLHOGDescriptorKernel.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H -#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H - -#include "arm_compute/core/IHOG.h" -#include "arm_compute/core/Size2D.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** OpenCL kernel to perform HOG Orientation Binning */ -class CLHOGOrientationBinningKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLHOGOrientationBinningKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default; - /** Default destructor */ - ~CLHOGOrientationBinningKernel() = default; - - /** Initialise the kernel's inputs, output and HOG's metadata - * - * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16. - * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8 - * @param[out] output Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] hog_info HOG's metadata - */ - void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info); - /** Initialise the kernel's inputs, output and HOG's metadata - * - * @param[in] compile_context The compile context to be used. - * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16. - * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8 - * @param[out] output Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[in] hog_info HOG's metadata - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input_magnitude; - const ICLTensor *_input_phase; - ICLTensor *_output; - Size2D _cell_size; -}; - -/** OpenCL kernel to perform HOG block normalization */ -class CLHOGBlockNormalizationKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLHOGBlockNormalizationKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default; - /** Default destructor */ - ~CLHOGBlockNormalizationKernel() = default; - - /** Initialise the kernel's input, output and HOG's metadata - * - * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block - * @param[in] hog_info HOG's metadata - */ - void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info); - /** Initialise the kernel's input, output and HOG's metadata - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell - * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block - * @param[in] hog_info HOG's metadata - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; - Size2D _num_cells_per_block_stride; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */ diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp deleted file mode 100644 index 861155b9a2..0000000000 --- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLHOGDetectorKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLHOG.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -using namespace arm_compute; - -CLHOGDetectorKernel::CLHOGDetectorKernel() - : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr) -{ -} - -void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, - float threshold, uint16_t idx_class) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, num_detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetectorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, - const Size2D &detection_window_stride, - float threshold, uint16_t idx_class) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32); - ARM_COMPUTE_ERROR_ON(hog == nullptr); - ARM_COMPUTE_ERROR_ON(detection_windows == nullptr); - ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr); - ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0); - ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0); - - const Size2D &detection_window_size = hog->info()->detection_window_size(); - const Size2D &block_size = hog->info()->block_size(); - const Size2D &block_stride = hog->info()->block_stride(); - - _input = input; - _detection_windows = detection_windows; - _num_detection_windows = num_detection_windows; - - const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels(); - const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1; - - ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size()); - - std::stringstream args_str; - args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " "; - args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " "; - args_str << "-DTHRESHOLD=" << threshold << " "; - args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " "; - args_str << "-DIDX_CLASS=" << idx_class << " "; - args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " "; - args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " "; - args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " "; - args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " "; - - // Construct kernel name - std::set build_opts = {}; - build_opts.insert(args_str.str()); - - // Create kernel - const std::string kernel_name = std::string("hog_detector"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters - _kernel.setArg(idx++, hog->cl_buffer()); - _kernel.setArg(idx++, detection_windows->cl_buffer()); - _kernel.setArg(idx++, *_num_detection_windows); - - // Get the number of blocks along the x and y directions of the input tensor - const ValidRegion &valid_region = input->info()->valid_region(); - const size_t num_blocks_x = valid_region.shape[0]; - const size_t num_blocks_y = valid_region.shape[1]; - - // Get the number of blocks along the x and y directions of the detection window - const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width; - const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height; - - const size_t window_step_x = detection_window_stride.width / block_stride.width; - const size_t window_step_y = detection_window_stride.height / block_stride.height; - - // Configure kernel window - Window win; - win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x)); - win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y)); - - constexpr unsigned int num_elems_read_per_iteration = 1; - const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y; - - update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration)); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.h b/src/core/CL/kernels/CLHOGDetectorKernel.h deleted file mode 100644 index c28e6ebe74..0000000000 --- a/src/core/CL/kernels/CLHOGDetectorKernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H -#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H - -#include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/ICLHOG.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "src/core/CL/ICLKernel.h" - -namespace cl -{ -class Buffer; -} - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel to perform HOG detector kernel using linear SVM */ -class CLHOGDetectorKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLHOGDetectorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default; - /** Default destructor */ - ~CLHOGDetectorKernel() = default; - - /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect - * - * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block - * @param[in] hog HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel - * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects - * @param[in] num_detection_windows Number of detected objects - * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions. - * It must be multiple of the hog->info()->block_stride() - * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane - * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to - */ - void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, - uint16_t idx_class = 0); - /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block - * @param[in] hog HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel - * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects - * @param[in] num_detection_windows Number of detected objects - * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions. - * It must be multiple of the hog->info()->block_stride() - * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane - * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, - const Size2D &detection_window_stride, float threshold = 0.0f, - uint16_t idx_class = 0); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue); - -private: - const ICLTensor *_input; - ICLDetectionWindowArray *_detection_windows; - cl::Buffer *_num_detection_windows; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */ diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp deleted file mode 100644 index cbc056fb77..0000000000 --- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLHarrisCornersKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include - -using namespace arm_compute; - -CLHarrisScoreKernel::CLHarrisScoreKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0) -{ -} - -BorderSize CLHarrisScoreKernel::border_size() const -{ - return _border_size; -} - -void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output, - int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, - bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, block_size, norm_factor, strength_thresh, sensitivity, border_undefined); -} - -void CLHarrisScoreKernel::configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output, - int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, - bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - ARM_COMPUTE_ERROR_ON(0.0f == norm_factor); - - _input1 = input1; - _input2 = input2; - _output = output; - _sensitivity = sensitivity; - _strength_thresh = strength_thresh; - _norm_factor = norm_factor; - _border_size = BorderSize(block_size / 2); - - // Select kernel - std::stringstream harris_score_kernel_name; - harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size; - - // Create build options - std::set build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) }; - - // Create kernel - _kernel = create_kernel(compile_context, harris_score_kernel_name.str(), build_opts); - - // Set static kernel arguments - unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, sensitivity); - _kernel.setArg(idx++, strength_thresh); - _kernel.setArg(idx++, norm_factor); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 4; - constexpr unsigned int num_elems_written_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = block_size == 7 ? 10 : 8; - const unsigned int num_rows_read_per_iteration = block_size; - - Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input1_access, input2_access, output_access); - - ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region()); - output_access.set_valid_region(win, valid_region, border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = harris_score_kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input1->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input2->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(1)); -} - -void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input1, slice); - add_2D_tensor_argument(idx, _input2, slice); - add_2D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.h b/src/core/CL/kernels/CLHarrisCornersKernel.h deleted file mode 100644 index 6482b0aa4e..0000000000 --- a/src/core/CL/kernels/CLHarrisCornersKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H -#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -#include - -namespace arm_compute -{ -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the harris score kernel. - * - * @note The implementation supports 3, 5, and 7 for the block_size. - */ -class CLHarrisScoreKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLHarrisScoreKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default; - /** Default destructor */ - ~CLHarrisScoreKernel() = default; - - /** Setup the kernel parameters - * - * @param[in] input1 Source image (gradient X). Data types supported S16, S32. (Must be the same as input2) - * @param[in] input2 Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1) - * @param[out] output Destination image (harris score). Data types supported F32 - * @param[in] block_size The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7 - * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0) - * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). - * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output, - int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, - bool border_undefined); - /** Setup the kernel parameters - * - * @param[in] compile_context The compile context to be used. - * @param[in] input1 Source image (gradient X). Data types supported S16, S32. (Must be the same as input2) - * @param[in] input2 Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1) - * @param[out] output Destination image (harris score). Data types supported F32 - * @param[in] block_size The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7 - * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0) - * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). - * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output, - int32_t block_size, float norm_factor, float strength_thresh, float sensitivity, - bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -protected: - const ICLImage *_input1; /**< Source image - Gx component */ - const ICLImage *_input2; /**< Source image - Gy component */ - ICLImage *_output; /**< Source image - Harris score */ - float _sensitivity; /**< Sensitivity value */ - float _strength_thresh; /**< Threshold value */ - float _norm_factor; /**< Normalization factor */ - BorderSize _border_size; /**< Border size */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */ diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp deleted file mode 100644 index ca5322aa51..0000000000 --- a/src/core/CL/kernels/CLHistogramKernel.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLHistogramKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLDistribution1D.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -// each thread handle 16 pixels -constexpr signed int pixels_per_item = 16; - -// local work group size in X dimension -constexpr unsigned int local_x_size = 16; - -CLHistogramKernel::CLHistogramKernel() - : _input(nullptr), _output(nullptr) -{ -} - -void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLHistogramKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - // Check input size - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - // Check offset - ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); - - // Check range - ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); - - _input = input; - _output = output; - - if(_input->info()->dimension(0) < pixels_per_item) - { - return; - } - - unsigned int num_bins = _output->num_bins(); - unsigned int window_size = _output->window(); - unsigned int offset = _output->offset(); - unsigned int range = _output->range(); - unsigned int offrange = offset + range; - unsigned int bin_size = _output->size(); - unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions - - // Create kernel - bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); - const std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel"; - _kernel = create_kernel(compile_context, kernel_name); - - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, buffer_size, nullptr); - _kernel.setArg(idx++, _output->cl_buffer()); - if(!is_fixed_size) - { - _kernel.setArg(idx++, num_bins); - _kernel.setArg(idx++, offset); - _kernel.setArg(idx++, range); - _kernel.setArg(idx++, offrange); - } - - // We only run histogram on Image, therefore only 2 dimensions here - unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item; - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(0, end_position, pixels_per_item)); - win.set(1, Window::Dimension(0, _input->info()->dimension(1))); - - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item)); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - // TODO (COMPMID-679): Add CLMemFill - _output->map(queue, true); - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - memset(_output->buffer(), 0, _output->size()); - _output->unmap(queue); - - if(_input->info()->dimension(0) < pixels_per_item) - { - return; - } - - Window slice = window.first_slice_window_2D(); - const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); - cl::NDRange lws = (local_x_size < gws_x) ? cl::NDRange(local_x_size, 1) : cl::NDRange(1, 1); - - do - { - /* Run the core part which has width can be divided by 16 */ - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - - enqueue(queue, *this, slice, lws); - } - while(window.slide_window_slice_2D(slice)); -} - -CLHistogramBorderKernel::CLHistogramBorderKernel() - : _input(nullptr), _output(nullptr) -{ -} - -void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLHistogramBorderKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(nullptr == output); - - // Check input size - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - // Check offset - ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range."); - - // Check range - ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range."); - - // We only run histogram on Image, therefore only 2 dimensions here - unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item; - - if(start_position >= input->info()->dimension(0)) - { - return; // no need to run histogram border kernel - } - - _input = input; - _output = output; - - unsigned int num_bins = _output->num_bins(); - unsigned int window_size = _output->window(); - unsigned int offset = _output->offset(); - unsigned int range = _output->range(); - unsigned int offrange = offset + range; - - // Create kernel - bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange); - const std::string kernel_name = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel"; - _kernel = create_kernel(compile_context, kernel_name); - - // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, _output->cl_buffer()); - if(!is_fixed_size) - { - _kernel.setArg(idx++, num_bins); - _kernel.setArg(idx++, offset); - _kernel.setArg(idx++, range); - _kernel.setArg(idx++, offrange); - } - - // Configure kernel window - Window win; - win.set(0, Window::Dimension(start_position, _input->info()->dimension(0))); - win.set(1, Window::Dimension(0, _input->info()->dimension(1))); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); -} - -void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue) -{ - if(window.x().start() >= window.x().end()) - { - return; - } - - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - cl::NDRange lws = cl::NDRange(1, 1); - - Window slice = window.first_slice_window_2D(); - - do - { - /* Run the border part which has width cannot be divided by 16 */ - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - - enqueue(queue, *this, slice, lws); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLHistogramKernel.h b/src/core/CL/kernels/CLHistogramKernel.h deleted file mode 100644 index 9c97c6590d..0000000000 --- a/src/core/CL/kernels/CLHistogramKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H -#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLDistribution1D; -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface to run the histogram kernel. This kernel processes the part of image with width can be divided by 16. - * If the image width is not a multiple of 16, remaining pixels have to be processed with the @ref CLHistogramBorderKernel - */ -class CLHistogramKernel : public ICLKernel -{ -public: - /** Constructor */ - CLHistogramKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHistogramKernel(const CLHistogramKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHistogramKernel &operator=(const CLHistogramKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHistogramKernel(CLHistogramKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHistogramKernel &operator=(CLHistogramKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Destination distribution. - */ - void configure(const ICLImage *input, ICLDistribution1D *output); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Destination distribution. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLImage *_input; - ICLDistribution1D *_output; -}; - -/** Interface to run the histogram kernel to handle the leftover part of image - * - */ -class CLHistogramBorderKernel : public ICLKernel -{ -public: - /** Constructor */ - CLHistogramBorderKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete; - /** Allow instances of this class to be moved */ - CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default; - /** Allow instances of this class to be moved */ - CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Destination distribution. - */ - void configure(const ICLImage *input, ICLDistribution1D *output); - /** Initialise the kernel's input, output and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source image. Data types supported: U8. - * @param[out] output Destination distribution. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLImage *_input; - ICLDistribution1D *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/ diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp deleted file mode 100644 index 5e5683d231..0000000000 --- a/src/core/CL/kernels/CLIntegralImageKernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLIntegralImageKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include - -using namespace arm_compute; - -void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLIntegralImageHorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); - - _input = input; - _output = output; - - // Create kernel - const std::string kernel_name = std::string("integral_horizontal"); - _kernel = create_kernel(compile_context, kernel_name); - - // Configure kernel window - const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); - const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16); - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration); - - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); -} - -CLIntegralImageVertKernel::CLIntegralImageVertKernel() - : _in_out(nullptr) -{ -} - -void CLIntegralImageVertKernel::configure(ICLTensor *in_out) -{ - configure(CLKernelLibrary::get().get_compile_context(), in_out); -} - -void CLIntegralImageVertKernel::configure(const CLCompileContext &compile_context, ICLTensor *in_out) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32); - - _in_out = in_out; - - // Create kernel - const std::string kernel_name = std::string("integral_vertical"); - _kernel = create_kernel(compile_context, kernel_name); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration_x = 8; - const unsigned int num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY); - - Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - update_window_and_padding(win, in_out_access); - - in_out_access.set_valid_region(win, in_out->info()->valid_region()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(in_out->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(in_out->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(in_out->info()->dimension(1)); -} - -void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const size_t height = _in_out->info()->dimension(1); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _in_out, slice); - _kernel.setArg(idx++, height); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLIntegralImageKernel.h b/src/core/CL/kernels/CLIntegralImageKernel.h deleted file mode 100644 index 0e40e3afbc..0000000000 --- a/src/core/CL/kernels/CLIntegralImageKernel.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H -#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H - -#include "src/core/CL/ICLKernel.h" -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface to run the horizontal pass of the integral image kernel. */ -class CLIntegralImageHorKernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output Destination tensor, Data types supported: U32. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output Destination tensor, Data types supported: U32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); -}; - -/** Interface to run the vertical pass of the integral image kernel. */ -class CLIntegralImageVertKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLIntegralImageVertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete; - /** Allow instances of this class to be moved */ - CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default; - /** Allow instances of this class to be moved */ - CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in,out] in_out The input/output tensor. Data types supported: U32 - */ - void configure(ICLTensor *in_out); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] in_out The input/output tensor. Data types supported: U32 - */ - void configure(const CLCompileContext &compile_context, ICLTensor *in_out); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - ICLTensor *_in_out; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */ diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp deleted file mode 100644 index 9845dd6169..0000000000 --- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLMagnitudePhaseKernel::CLMagnitudePhaseKernel() - : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false) -{ -} - -void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, - MagnitudeType mag_type, PhaseType phase_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), gx, gy, magnitude, phase, mag_type, phase_type); -} - -void CLMagnitudePhaseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, - MagnitudeType mag_type, PhaseType phase_type) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr)); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy); - - _run_mag = (magnitude != nullptr); - _run_phase = (phase != nullptr); - if(_run_mag) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude); - } - if(_run_phase) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); - } - - if(!_run_mag && !_run_phase) - { - ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); - } - - _gx = gx; - _gy = gy; - _magnitude = magnitude; - _phase = phase; - - // Construct kernel name - std::set build_opts = {}; - - // Add magnitude type - if(_run_mag) - { - switch(mag_type) - { - case MagnitudeType::L1NORM: - build_opts.insert("-DMAGNITUDE=1"); - break; - case MagnitudeType::L2NORM: - build_opts.insert("-DMAGNITUDE=2"); - break; - default: - ARM_COMPUTE_ERROR("Unsupported magnitude calculation type."); - build_opts.insert("-DMAGNITUDE=0"); - break; - } - } - - // Add phase type - if(_run_phase) - { - switch(phase_type) - { - case PhaseType::UNSIGNED: - build_opts.insert("-DPHASE=1"); - break; - case PhaseType::SIGNED: - build_opts.insert("-DPHASE=2"); - break; - default: - ARM_COMPUTE_ERROR("Unsupported phase calculation type."); - build_opts.insert("-DPHASE=0"); - break; - } - } - - // Add data_type - build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type())); - - // Create kernel - const std::string kernel_name = std::string("magnitude_phase"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - - Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - gx_access, gy_access, - output_magnitude_access, output_phase_access); - - ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), - gy->info()->valid_region()); - output_magnitude_access.set_valid_region(win, valid_region); - output_phase_access.set_valid_region(win, valid_region); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(gx->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(gx->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gx->info()->dimension(1)); -} - -void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _gx, slice); - add_2D_tensor_argument(idx, _gy, slice); - add_2D_tensor_argument_if((_run_mag), idx, _magnitude, slice); - add_2D_tensor_argument_if((_run_phase), idx, _phase, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.h b/src/core/CL/kernels/CLMagnitudePhaseKernel.h deleted file mode 100644 index 514036b2ff..0000000000 --- a/src/core/CL/kernels/CLMagnitudePhaseKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H -#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Template interface for the kernel to compute magnitude and phase. - * - */ -class CLMagnitudePhaseKernel : public ICLKernel -{ -public: - /** Default constructor. */ - CLMagnitudePhaseKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete; - /** Allow instances of this class to be moved */ - CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default; - /** Allow instances of this class to be moved */ - CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default; - /** Initialise the kernel's input, output. - * - * @note At least one of output1 or output2 must be set. - * - * @param[in] gx The input gradient X tensor. Data types supported: S16/S32. - * @param[in] gy The input gradient Y tensor. Data types supported: S16/S32. - * @param[out] magnitude (Optional) The output tensor - Magnitude. Data types supported: S16/S32. - * @param[out] phase (Optional) The output tensor - Phase. Data types supported: U8. - * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. - * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. - */ - void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, - MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED); - /** Initialise the kernel's input, output. - * - * @note At least one of output1 or output2 must be set. - * - * @param[in] compile_context The compile context to be used. - * @param[in] gx The input gradient X tensor. Data types supported: S16/S32. - * @param[in] gy The input gradient Y tensor. Data types supported: S16/S32. - * @param[out] magnitude (Optional) The output tensor - Magnitude. Data types supported: S16/S32. - * @param[out] phase (Optional) The output tensor - Phase. Data types supported: U8. - * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. - * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, - MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_gx; /**< Input gradient X. */ - const ICLTensor *_gy; /**< Input gradient Y. */ - ICLTensor *_magnitude; /**< Output - Magnitude. */ - ICLTensor *_phase; /**< Output - Phase. */ - bool _run_mag; /**< Calculate magnitude ? */ - bool _run_phase; /**< Calculate phase ? */ -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */ diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp deleted file mode 100644 index aed6e6eaf7..0000000000 --- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLMeanStdDevKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include - -using namespace arm_compute; - -CLMeanStdDevKernel::CLMeanStdDevKernel() - : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _border_size(0) -{ -} - -BorderSize CLMeanStdDevKernel::border_size() const -{ - return _border_size; -} - -Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) -{ - ARM_COMPUTE_UNUSED(mean); - ARM_COMPUTE_UNUSED(stddev); - ARM_COMPUTE_UNUSED(global_sum); - ARM_COMPUTE_UNUSED(global_sum_squared); - ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED(); - ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - return Status{}; -} - -void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, mean, global_sum, stddev, global_sum_squared); -} - -void CLMeanStdDevKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum); - ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared); - ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared)); - - _input = input; - _mean = mean; - _stddev = stddev; - _global_sum = global_sum; - _global_sum_squared = global_sum_squared; - - // Create kernel - std::set build_opts; - - if(_stddev != nullptr) - { - build_opts.insert("-DSTDDEV"); - } - - _kernel = create_kernel(compile_context, "mean_stddev_accumulate", build_opts); - - // Set fixed arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters - - _kernel.setArg(idx++, static_cast(input->info()->dimension(1))); - _kernel.setArg(idx++, *_global_sum); - - if(_stddev != nullptr) - { - _kernel.setArg(idx++, *_global_sum_squared); - } - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration_x = 8; - const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1); - - _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - input->info()->dimension(0)); - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - update_window_and_padding(win, input_access); - - ICLKernel::configure_internal(win); -} - -void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Clear sums - static const cl_ulong zero = 0; - queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero); - - if(_stddev != nullptr) - { - queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero); - } - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - // Set slice step equal to height to force gws[1] to 1, - // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item - slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - - // Calculate mean and stddev - cl_ulong global_sum = 0; - cl_ulong global_sum_squared = 0; - const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1); - - queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast(&global_sum)); - const float mean = global_sum / num_pixels; - *_mean = mean; - - if(_stddev != nullptr) - { - queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast(&global_sum_squared)); - *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean)); - } -} diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.h b/src/core/CL/kernels/CLMeanStdDevKernel.h deleted file mode 100644 index 179a2025b7..0000000000 --- a/src/core/CL/kernels/CLMeanStdDevKernel.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H -#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace cl -{ -class Buffer; -} - -namespace arm_compute -{ -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */ -class CLMeanStdDevKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLMeanStdDevKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete; - /** Allow instances of this class to be moved */ - CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default; - /** Allow instances of this class to be moved */ - CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default; - /** Initialise the kernel's input and outputs. - * - * @param[in] input Input image. Data types supported: U8. - * @param[out] mean Input average pixel value. - * @param[out] global_sum Keeps global sum of pixel values (Buffer size: 1 cl_ulong). - * @param[out] stddev (Optional) Output standard deviation of pixel values. - * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong). - */ - void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr); - /** Initialise the kernel's input and outputs. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input image. Data types supported: U8. - * @param[out] mean Input average pixel value. - * @param[out] global_sum Keeps global sum of pixel values (Buffer size: 1 cl_ulong). - * @param[out] stddev (Optional) Output standard deviation of pixel values. - * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong). - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr); - /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel. - * - * @param[in] input Input image info. Data types supported: U8. - * @param[in] mean Input average pixel value. - * @param[in] global_sum Keeps global sum of pixel values. - * @param[in] stddev (Optional) Output standard deviation of pixel values. - * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - - BorderSize border_size() const override; - -private: - const ICLImage *_input; - float *_mean; - float *_stddev; - cl::Buffer *_global_sum; - cl::Buffer *_global_sum_squared; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */ diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp deleted file mode 100644 index 23a21d6b19..0000000000 --- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLMedian3x3Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -using namespace arm_compute; - -BorderSize CLMedian3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLMedian3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - _input = input; - _output = output; - - // Create kernel - const std::string kernel_name = std::string("non_linear_filter_box3x3"); - _kernel = create_kernel(compile_context, kernel_name, { "-DMEDIAN" }); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.h b/src/core/CL/kernels/CLMedian3x3Kernel.h deleted file mode 100644 index 8cc5ed7279..0000000000 --- a/src/core/CL/kernels/CLMedian3x3Kernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H -#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the median 3x3 filter kernel. - * - */ -class CLMedian3x3Kernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */ diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp deleted file mode 100644 index 675cfc19a9..0000000000 --- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -inline int32_t FloatFlip(float val) -{ - static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t"); - int32_t int_val = 0; - - memcpy(&int_val, &val, sizeof(float)); - int_val = (int_val >= 0) ? int_val : int_val ^ 0x7FFFFFFF; - return int_val; -} - -inline float IFloatFlip(int32_t val) -{ - static_assert(sizeof(float) == sizeof(int32_t), "Float must be same size as int32_t"); - float flt_val = 0.f; - - val = (val >= 0) ? val : val ^ 0x7FFFFFFF; - memcpy(&flt_val, &val, sizeof(float)); - return flt_val; -} - -CLMinMaxKernel::CLMinMaxKernel() - : _input(nullptr), _min_max(), _data_type_max_min() -{ -} - -void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, min_max); -} - -void CLMinMaxKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(min_max == nullptr); - - _input = input; - _min_max = min_max; - const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0); - - switch(input->info()->data_type()) - { - case DataType::U8: - _data_type_max_min[0] = UCHAR_MAX; - _data_type_max_min[1] = 0; - break; - case DataType::S16: - _data_type_max_min[0] = SHRT_MAX; - _data_type_max_min[1] = SHRT_MIN; - break; - case DataType::F32: - _data_type_max_min[0] = FloatFlip(std::numeric_limits::max()); - _data_type_max_min[1] = FloatFlip(std::numeric_limits::lowest()); - break; - default: - ARM_COMPUTE_ERROR("You called with the wrong image data types"); - } - - // Set kernel build options - std::set build_opts{ "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) }; - - if(num_elems_processed_per_iteration % max_cl_vector_width != 0) - { - build_opts.emplace("-DNON_MULTIPLE_OF_16"); - } - - if(input->info()->data_type() == DataType::F32) - { - build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(std::numeric_limits::max())); - build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(std::numeric_limits::lowest())); - build_opts.emplace("-DIS_DATA_TYPE_FLOAT"); - } - else - { - build_opts.emplace("-DDATA_TYPE_MAX=" + support::cpp11::to_string(_data_type_max_min[0])); - build_opts.emplace("-DDATA_TYPE_MIN=" + support::cpp11::to_string(_data_type_max_min[1])); - } - - // Create kernel - _kernel = create_kernel(compile_context, "minmax", build_opts); - - // Set fixed arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, *_min_max); - _kernel.setArg(idx++, static_cast(input->info()->dimension(0))); - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, ceil_to_multiple(num_elems_processed_per_iteration, 16))); - ICLKernel::configure_internal(win); -} - -void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Reset mininum and maximum values - queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data()); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - - cl_int min = 0; - cl_int max = 0; - queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 0 * sizeof(cl_int), sizeof(cl_int), static_cast(&min)); - queue.enqueueReadBuffer(*_min_max, CL_TRUE /* blocking */, 1 * sizeof(cl_int), sizeof(cl_int), static_cast(&max)); - - if(_input->info()->data_type() == DataType::F32) - { - std::array min_max = - { - { - IFloatFlip(min), - IFloatFlip(max) - } - }; - queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(float), min_max.data()); - } - else - { - std::array min_max = { { min, max } }; - queue.enqueueWriteBuffer(*_min_max, CL_TRUE /* blocking */, 0, min_max.size() * sizeof(int32_t), min_max.data()); - } -} - -CLMinMaxLocationKernel::CLMinMaxLocationKernel() - : _input(nullptr), _min_max_count(nullptr) -{ -} - -void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, min_max, min_max_count, min_loc, max_loc); -} - -void CLMinMaxLocationKernel::configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, - ICLCoordinates2DArray *max_loc) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(min_max == nullptr); - ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr); - - _input = input; - _min_max_count = min_max_count; - - // Set kernel build options - std::set build_opts; - build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : ""); - build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : ""); - build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : ""); - if(input->info()->data_type() == DataType::F32) - { - build_opts.emplace("-DIS_DATA_TYPE_FLOAT"); - } - - // Create kernel - _kernel = create_kernel(compile_context, "minmaxloc", build_opts); - - // Set static arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, *min_max); - _kernel.setArg(idx++, *min_max_count); - if(min_loc != nullptr) - { - _kernel.setArg(idx++, min_loc->cl_buffer()); - _kernel.setArg(idx++, min_loc->max_num_values()); - } - if(max_loc != nullptr) - { - _kernel.setArg(idx++, max_loc->cl_buffer()); - _kernel.setArg(idx++, max_loc->max_num_values()); - } - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 1; - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); -} - -void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - static const unsigned int zero_count = 0; - queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count); - queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.h b/src/core/CL/kernels/CLMinMaxLocationKernel.h deleted file mode 100644 index 2196abe033..0000000000 --- a/src/core/CL/kernels/CLMinMaxLocationKernel.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H -#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H - -#include "arm_compute/core/CL/ICLArray.h" -#include "src/core/CL/ICLKernel.h" - -#include - -namespace arm_compute -{ -class ICLTensor; -using ICLImage = ICLTensor; - -/** Interface for the kernel to perform min max search on an image. - */ -class CLMinMaxKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLMinMaxKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxKernel(const CLMinMaxKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete; - /** Allow instances of this class to be moved */ - CLMinMaxKernel(CLMinMaxKernel &&) = default; - /** Allow instances of this class to be moved */ - CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input Image. Data types supported: U8/S16/F32. - * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32. - */ - void configure(const ICLImage *input, cl::Buffer *min_max); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input Image. Data types supported: U8/S16/F32. - * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; /**< Input image. */ - cl::Buffer *_min_max; /**< Minimum/maximum value. */ - std::array _data_type_max_min; /**< Maximum and minimum data type value respectively. */ -}; - -/** Interface for the kernel to find min max locations of an image. - */ -class CLMinMaxLocationKernel : public ICLKernel -{ -public: - /** Constructor */ - CLMinMaxLocationKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete; - /** Allow instances of this class to be moved */ - CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default; - /** Allow instances of this class to be moved */ - CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default; - /** Initialise the kernel's input and outputs. - * - * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. - * - * @param[in] input Input image. Data types supported: U8/S16/F32. - * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32. - * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32 - * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations. - * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations. - */ - void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, - ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr); - /** Initialise the kernel's input and outputs. - * - * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input image. Data types supported: U8/S16/F32. - * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32. - * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32 - * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations. - * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations. - */ - void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, - ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLImage *_input; /**< Input image. */ - cl::Buffer *_min_max_count; /**< Minimum/maximum value occurrences. */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */ diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp deleted file mode 100644 index c73acaf1d8..0000000000 --- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include -#include -#include -#include - -using namespace arm_compute; - -CLNonLinearFilterKernel::CLNonLinearFilterKernel() - : _border_size(0) -{ -} - -BorderSize CLNonLinearFilterKernel::border_size() const -{ - return _border_size; -} - -void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, - unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_undefined); -} - -void CLNonLinearFilterKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, - unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5); - ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!"); - ARM_COMPUTE_UNUSED(mask); - - _input = input; - _output = output; - _border_size = BorderSize(mask_size / 2); - - // Define build options - std::set build_opts; - build_opts.emplace("-D" + string_from_non_linear_filter_function(function)); - - // Define kernel - std::string pattern_name = string_from_matrix_pattern(pattern); - std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower); - std::stringstream ss; - ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size; - - // Create kernel - _kernel = create_kernel(compile_context, ss.str(), build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - const unsigned int num_rows_read_per_iteration = mask_size; - - Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.h b/src/core/CL/kernels/CLNonLinearFilterKernel.h deleted file mode 100644 index ed42063d2b..0000000000 --- a/src/core/CL/kernels/CLNonLinearFilterKernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H -#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLSimple2DKernel.h" - -#include - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to apply a non-linear filter */ -class CLNonLinearFilterKernel : public ICLSimple2DKernel -{ -public: - /** Default constructor */ - CLNonLinearFilterKernel(); - /** Set the source, destination and border mode of the kernel - * - * @param[in] input Source tensor. Data types supported: U8 - * @param[out] output Destination tensor. Data types supported: U8 - * @param[in] function Non linear function to perform - * @param[in] mask_size Mask size. Supported sizes: 3, 5 - * @param[in] pattern Mask pattern - * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, - unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - bool border_undefined); - /** Set the source, destination and border mode of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8 - * @param[out] output Destination tensor. Data types supported: U8 - * @param[in] function Non linear function to perform - * @param[in] mask_size Mask size. Supported sizes: 3, 5 - * @param[in] pattern Mask pattern - * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, - unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; - -private: - BorderSize _border_size; /**< Border size */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */ diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp deleted file mode 100644 index 7d5c5ba7e1..0000000000 --- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_undefined); -} - -void CLNonMaximaSuppression3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); - - _input = input; - _output = output; - - // Create kernel - std::set build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) }; - _kernel = create_kernel(compile_context, "non_max_suppression", build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h deleted file mode 100644 index d9ed60ce6b..0000000000 --- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H -#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL - * - * @note Used by @ref CLFastCorners and @ref CLHarrisCorners - */ -class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's sources, destinations and border mode. - * - * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor) - * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor) - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); - /** Initialise the kernel's sources, destinations and border mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor) - * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor) - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */ diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp deleted file mode 100644 index 7ceddc9626..0000000000 --- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLScharr3x3Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -CLScharr3x3Kernel::CLScharr3x3Kernel() - : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr) -{ -} - -BorderSize CLScharr3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined); -} - -void CLScharr3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_scharr_x = output_x != nullptr; - _run_scharr_y = output_y != nullptr; - - if(_run_scharr_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); - } - - if(_run_scharr_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); - } - - _input = input; - _output_x = output_x; - _output_y = output_y; - - // Set build options - std::set build_opts; - - if(_run_scharr_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_scharr_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - _kernel = create_kernel(compile_context, "scharr3x3", build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); -} - -void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument_if((_run_scharr_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_scharr_y), idx, _output_y, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.h b/src/core/CL/kernels/CLScharr3x3Kernel.h deleted file mode 100644 index a670da5b6f..0000000000 --- a/src/core/CL/kernels/CLScharr3x3Kernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H -#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to run a 3x3 Scharr filter on a tensor. - * - * @f[ - * \mathbf{G}_x=\begin{vmatrix} - * -3 & 0 & +3\\ - * -10& 0 & +10\\ - * -3 & 0 & +3 - * \end{vmatrix} - * @f] - * @f[ - * \mathbf{G}_y=\begin{vmatrix} - * -3 & -10 & -3\\ - * 0 & 0 & 0\\ - * +3 & +10 & +3 - * \end{vmatrix} - * @f] - */ -class CLScharr3x3Kernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLScharr3x3Kernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete; - /** Allow instances of this class to be moved */ - CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default; - /** Allow instances of this class to be moved */ - CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default; - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - bool _run_scharr_x; /**< Do we need to run Scharr X ? */ - bool _run_scharr_y; /**< Do we need to run Scharr Y ? */ - const ICLTensor *_input; /**< Input image */ - ICLTensor *_output_x; /**< Output image for scharr X */ - ICLTensor *_output_y; /**< Output image for scharr Y */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */ diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp deleted file mode 100644 index a87677a38f..0000000000 --- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLSobel3x3Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLSobel3x3Kernel::CLSobel3x3Kernel() - : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) -{ -} - -BorderSize CLSobel3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined); -} - -void CLSobel3x3Kernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_sobel_x = output_x != nullptr; - _run_sobel_y = output_y != nullptr; - - if(_run_sobel_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); - } - - if(_run_sobel_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); - } - - _input = input; - _output_x = output_x; - _output_y = output_y; - - // Set build options - std::set build_opts; - - if(_run_sobel_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_sobel_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - const std::string kernel_name = std::string("sobel3x3"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 3; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.h b/src/core/CL/kernels/CLSobel3x3Kernel.h deleted file mode 100644 index fed8068762..0000000000 --- a/src/core/CL/kernels/CLSobel3x3Kernel.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H -#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */ -class CLSobel3x3Kernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLSobel3x3Kernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete; - /** Allow instances of this class to be moved */ - CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default; - /** Allow instances of this class to be moved */ - CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default; - /** Default destructor */ - ~CLSobel3x3Kernel() = default; - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; /**< Input tensor */ - ICLTensor *_output_x; /**< Output tensor for Sobel X */ - ICLTensor *_output_y; /**< Output tensor for Sobel Y */ - bool _run_sobel_x; /**< Do we need to run Sobel X ? */ - bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */ diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp deleted file mode 100644 index c450becd1d..0000000000 --- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLSobel5x5Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLSobel5x5HorKernel::CLSobel5x5HorKernel() - : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) -{ -} - -BorderSize CLSobel5x5HorKernel::border_size() const -{ - return _border_size; -} - -void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined); -} - -void CLSobel5x5HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_sobel_x = output_x != nullptr; - _run_sobel_y = output_y != nullptr; - - if(_run_sobel_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); - } - - if(_run_sobel_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); - } - - _input = input; - _output_x = output_x; - _output_y = output_y; - _border_size = BorderSize(border_undefined ? 0 : 2, 2); - - // Set build options - std::set build_opts; - - if(_run_sobel_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_sobel_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - const std::string kernel_name = std::string("sobel_separable1x5"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLSobel5x5VertKernel::CLSobel5x5VertKernel() - : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) -{ -} - -BorderSize CLSobel5x5VertKernel::border_size() const -{ - return BorderSize{ 2, 0 }; -} - -void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined); -} - -void CLSobel5x5VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_sobel_x = output_x != nullptr; - _run_sobel_y = output_y != nullptr; - - if(_run_sobel_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); - } - - if(_run_sobel_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); - } - - _input_x = input_x; - _input_y = input_y; - _output_x = output_x; - _output_y = output_y; - - // Set build options - std::set build_opts; - - if(_run_sobel_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_sobel_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - const std::string kernel_name = std::string("sobel_separable5x1"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - const ICLTensor *input = _run_sobel_x ? _input_x : _input_y; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 5; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice); - add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice); - - _kernel.setArg(idx++, 0 /*dummy*/); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.h b/src/core/CL/kernels/CLSobel5x5Kernel.h deleted file mode 100644 index a163ac932a..0000000000 --- a/src/core/CL/kernels/CLSobel5x5Kernel.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H -#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */ -class CLSobel5x5HorKernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLSobel5x5HorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default; - /** Default destructor */ - ~CLSobel5x5HorKernel() = default; - - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; /**< Input tensor */ - ICLTensor *_output_x; /**< X output of horizontal pass */ - ICLTensor *_output_y; /**< Y output of horizontal pass */ - bool _run_sobel_x; /**< Do we need to run Sobel X ? */ - bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ - BorderSize _border_size; /**< Border size */ -}; - -/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */ -class CLSobel5x5VertKernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLSobel5x5VertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default; - /** Default destructor */ - ~CLSobel5x5VertKernel() = default; - - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set and the corresponding input. - * - * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16. - * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set and the corresponding input. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16. - * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */ - const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */ - ICLTensor *_output_x; /**< X output of sobel */ - ICLTensor *_output_y; /**< Y output of sobel */ - bool _run_sobel_x; /**< Do we need to run sobel X? */ - bool _run_sobel_y; /**< Do we need to run sobel Y? */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */ diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp deleted file mode 100644 index 1cfa74f7b3..0000000000 --- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLSobel7x7Kernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include - -using namespace arm_compute; - -CLSobel7x7HorKernel::CLSobel7x7HorKernel() - : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) -{ -} - -BorderSize CLSobel7x7HorKernel::border_size() const -{ - return _border_size; -} - -void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_undefined); -} - -void CLSobel7x7HorKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_sobel_x = output_x != nullptr; - _run_sobel_y = output_y != nullptr; - - if(_run_sobel_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); - } - - if(_run_sobel_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); - } - - _input = input; - _output_x = output_x; - _output_y = output_y; - _border_size = BorderSize(border_undefined ? 0 : 3, 3); - - // Construct kernel name - const std::string kernel_name = "sobel_separable1x7"; - - // Set build options - std::set build_opts; - - if(_run_sobel_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_sobel_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} - -CLSobel7x7VertKernel::CLSobel7x7VertKernel() - : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) -{ -} - -BorderSize CLSobel7x7VertKernel::border_size() const -{ - return BorderSize{ 3, 0 }; -} - -void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - configure(CLKernelLibrary::get().get_compile_context(), input_x, input_y, output_x, output_y, border_undefined); -} - -void CLSobel7x7VertKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); - - _run_sobel_x = output_x != nullptr; - _run_sobel_y = output_y != nullptr; - - if(_run_sobel_x) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32); - } - - if(_run_sobel_y) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32); - } - - _input_x = input_x; - _input_y = input_y; - _output_x = output_x; - _output_y = output_y; - - // Set build options - std::set build_opts; - - if(_run_sobel_x) - { - build_opts.insert("-DGRAD_X"); - } - - if(_run_sobel_y) - { - build_opts.insert("-DGRAD_Y"); - } - - // Create kernel - const std::string kernel_name = std::string("sobel_separable7x1"); - _kernel = create_kernel(compile_context, kernel_name, build_opts); - - const ICLTensor *input = _run_sobel_x ? _input_x : _input_y; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_written_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 8; - constexpr unsigned int num_rows_read_per_iteration = 7; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - - AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); - AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access); - - output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_undefined); -} - -void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - - add_2D_tensor_argument_if((_run_sobel_x), idx, _input_x, slice); - add_2D_tensor_argument_if((_run_sobel_x), idx, _output_x, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _input_y, slice); - add_2D_tensor_argument_if((_run_sobel_y), idx, _output_y, slice); - - _kernel.setArg(idx++, 0 /*dummy*/); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.h b/src/core/CL/kernels/CLSobel7x7Kernel.h deleted file mode 100644 index c85f0aedf9..0000000000 --- a/src/core/CL/kernels/CLSobel7x7Kernel.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H -#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */ -class CLSobel7x7HorKernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLSobel7x7HorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default; - /** Default destructor */ - ~CLSobel7x7HorKernel() = default; - - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input; /**< Input tensor */ - ICLTensor *_output_x; /**< X output of horizontal pass */ - ICLTensor *_output_y; /**< Y output of horizontal pass */ - bool _run_sobel_x; /**< Do we need to run Sobel X ? */ - bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ - BorderSize _border_size; /**< Border size */ -}; - -/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */ -class CLSobel7x7VertKernel : public ICLKernel -{ -public: - /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ - CLSobel7x7VertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete; - /** Allow instances of this class to be moved */ - CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default; - /** Allow instances of this class to be moved */ - CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default; - /** Default destructor */ - ~CLSobel7x7VertKernel() = default; - - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set and the corresponding input. - * - * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32. - * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - /** Initialise the kernel's source, destination and border. - * - * @note At least one of output_x or output_y must be set and the corresponding input. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32. - * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32. - * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. - * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */ - const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */ - ICLTensor *_output_x; /**< X output of sobel */ - ICLTensor *_output_y; /**< Y output of sobel */ - bool _run_sobel_x; /**< Do we need to run sobel X? */ - bool _run_sobel_y; /**< Do we need to run sobel Y? */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */ diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp deleted file mode 100644 index b82f4c9889..0000000000 --- a/src/core/CL/kernels/CLTableLookupKernel.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLTableLookupKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLLut.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" - -#include -#include - -using namespace arm_compute; - -void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); -} - -void CLTableLookupKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON(lut == nullptr); - ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - // Create kernel - std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8"; - _kernel = create_kernel(compile_context, kernel_name); - - // Set lut argument - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, lut->cl_buffer()); - if(DataType::S16 == lut->type()) - { - _kernel.setArg(idx++, lut->index_offset()); - _kernel.setArg(idx++, static_cast(lut->num_elements())); - } - - // Configure kernel - constexpr unsigned int num_elems_processed_per_iteration = 8; - ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); -} diff --git a/src/core/CL/kernels/CLTableLookupKernel.h b/src/core/CL/kernels/CLTableLookupKernel.h deleted file mode 100644 index c8d15cbee2..0000000000 --- a/src/core/CL/kernels/CLTableLookupKernel.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H -#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H - -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; -class ICLLut; - -/** Interface for the kernel to perform table lookup calculations. */ -class CLTableLookupKernel : public ICLSimple2DKernel -{ -public: - /** Initialise the kernel's input, lut and output. - * - * @param[in] input An input tensor. Data types supported: U8, S16. - * @param[in] lut The input LUT. Data types supported: U8, S16. - * @param[out] output The output tensor. Data types supported: U8, S16. - */ - void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output); - /** Initialise the kernel's input, lut and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8, S16. - * @param[in] lut The input LUT. Data types supported: U8, S16. - * @param[out] output The output tensor. Data types supported: U8, S16. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output); -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */ diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp deleted file mode 100644 index 72c22f043c..0000000000 --- a/src/core/CL/kernels/CLThresholdKernel.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLThresholdKernel.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include - -namespace arm_compute -{ -void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, info); -} - -void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - // Construct kernel name - std::string kernel_name = "threshold"; - - switch(info.type) - { - case ThresholdType::BINARY: - kernel_name += "_binary"; - break; - case ThresholdType::RANGE: - kernel_name += "_range"; - break; - default: - ARM_COMPUTE_ERROR("Thresholding type not recognized"); - break; - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name); - - // Set arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, info.false_value); - _kernel.setArg(idx++, info.true_value); - _kernel.setArg(idx++, info.threshold); - - if(ThresholdType::RANGE == info.type) - { - _kernel.setArg(idx++, info.upper); - } - - // Make sure _kernel is initialized before calling the parent's configure - constexpr unsigned int num_elems_processed_per_iteration = 16; - ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLThresholdKernel.h b/src/core/CL/kernels/CLThresholdKernel.h deleted file mode 100644 index 511eaed1bf..0000000000 --- a/src/core/CL/kernels/CLThresholdKernel.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H -#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ICLTensor; - -/** Interface for the thresholding kernel. */ -class CLThresholdKernel : public ICLSimple2DKernel -{ -public: - /**Initialise the kernel's input, output and threshold parameters. - * - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] info Threshold descriptor - */ - void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info); - /**Initialise the kernel's input, output and threshold parameters. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input An input tensor. Data types supported: U8 - * @param[out] output The output tensor. Data types supported: U8. - * @param[in] info Threshold descriptor - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info); -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */ diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp deleted file mode 100644 index 600c67a528..0000000000 --- a/src/core/CL/kernels/CLWarpAffineKernel.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLWarpAffineKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void options_add_matrix(std::set &options, const std::array &matrix) -{ - for(size_t i = 0; i < 6; ++i) - { - std::stringstream mat_str; - mat_str << "-DMAT" << i << "=" << matrix[i] << " "; - options.insert(mat_str.str()); - } -} -} // namespace - -BorderSize CLWarpAffineKernel::border_size() const -{ - return BorderSize(1); -} - -void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy); -} - -void CLWarpAffineKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy); - - _input = input; - _output = output; - - // Create build options - std::set options; - options_add_matrix(options, matrix); - options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - - // Create kernel - std::string interpolation_name = string_from_interpolation_policy(policy); - std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); - const std::string kernel_name = "warp_affine_" + interpolation_name; - _kernel = create_kernel(compile_context, kernel_name, options); - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, input->info()->dimension(0)); - _kernel.setArg(idx++, input->info()->dimension(1)); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration); - const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0); - - AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(input->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(3)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(3)); - _config_id += "_"; - _config_id += lower_string(string_from_interpolation_policy(policy)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLWarpAffineKernel.h b/src/core/CL/kernels/CLWarpAffineKernel.h deleted file mode 100644 index c600ee780d..0000000000 --- a/src/core/CL/kernels/CLWarpAffineKernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H -#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the warp affine kernel.*/ -class CLWarpAffineKernel : public ICLSimple2DKernel -{ -public: - /** Initialize the function's source, destination, interpolation policy and border_mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8. - * @param[in] matrix The perspective matrix. Must be 2x3 of type float - * The matrix argument requires 9 values, the last 3 values are ignored. - * @param[in] policy The interpolation type. - */ - void configure(const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy); - /** Initialize the function's source, destination, interpolation policy and border_mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8. - * @param[in] matrix The perspective matrix. Must be 2x3 of type float - * The matrix argument requires 9 values, the last 3 values are ignored. - * @param[in] policy The interpolation type. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */ diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp deleted file mode 100644 index 5f20a0bdd3..0000000000 --- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include -#include - -using namespace arm_compute; - -namespace -{ -inline void options_add_matrix(std::set &options, const std::array &matrix) -{ - for(size_t i = 0; i < 9; ++i) - { - std::stringstream mat_str; - mat_str << "-DMAT" << i << "=" << matrix[i] << " "; - options.insert(mat_str.str()); - } -} -} // namespace - -BorderSize CLWarpPerspectiveKernel::border_size() const -{ - return BorderSize(1); -} - -void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy); -} - -void CLWarpPerspectiveKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy); - - _input = input; - _output = output; - - // Create build options - std::set options; - options_add_matrix(options, matrix); - options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); - - // Create kernel - std::string interpolation_name = string_from_interpolation_policy(policy); - std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); - std::string kernel_name = "warp_perspective_" + interpolation_name; - _kernel = create_kernel(compile_context, kernel_name, options); - - // Set static kernel arguments - unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, input->info()->dimension(0)); - _kernel.setArg(idx++, input->info()->dimension(1)); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, input->info()->dimension(0) + border_size().right, input->info()->dimension(1) + border_size().bottom); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - ICLKernel::configure_internal(win); -} diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.h b/src/core/CL/kernels/CLWarpPerspectiveKernel.h deleted file mode 100644 index dcbe1c5560..0000000000 --- a/src/core/CL/kernels/CLWarpPerspectiveKernel.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H -#define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/CL/ICLSimple2DKernel.h" - -namespace arm_compute -{ -class ICLTensor; -/** Interface for the warp perspective kernel.*/ -class CLWarpPerspectiveKernel : public ICLSimple2DKernel -{ -public: - /** Initialize the function's source, destination, interpolation policy and border_mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8. - * @param[in] matrix The perspective matrix. Must be 3x3 of type float. - * @param[in] policy The interpolation type. - */ - void configure(const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy); - /** Initialize the function's source, destination, interpolation policy and border_mode. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U8. - * @param[in] matrix The perspective matrix. Must be 3x3 of type float. - * @param[in] policy The interpolation type. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy); - - // Inherited methods overridden: - BorderSize border_size() const override; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H */ diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index aea245c6fb..b2c5592960 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -35,7 +35,6 @@ #include "src/core/NEON/kernels/NECol2ImKernel.h" #include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" #include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "src/core/NEON/kernels/NEConvolutionKernel.h" #include "src/core/NEON/kernels/NECropKernel.h" #include "src/core/NEON/kernels/NECumulativeDistributionKernel.h" #include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" @@ -72,7 +71,6 @@ #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" #include "src/core/NEON/kernels/NEMinMaxLayerKernel.h" -#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" @@ -83,6 +81,7 @@ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "src/core/NEON/kernels/NERangeKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/NEON/kernels/NERemapKernel.h" #include "src/core/NEON/kernels/NEReorgLayerKernel.h" #include "src/core/NEON/kernels/NEReverseKernel.h" #include "src/core/NEON/kernels/NEScaleKernel.h" diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp deleted file mode 100644 index 075de41203..0000000000 --- a/src/core/NEON/kernels/NEConvolutionKernel.cpp +++ /dev/null @@ -1,1625 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/kernels/NEConvolutionKernel.h" - -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX); - -inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output) -{ - const int16x8_t s16results = vcombine_s16(vqmovn_s32(out), - vqmovn_s32(out2)); - vst1q_s16(output, s16results); -} - -inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output) -{ - const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out), - vqmovun_s32(out2))); - vst1_u8(output, u8results); -} - -inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output) -{ - const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2)); - const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16)); - vst1q_s16(output, s16results); -} - -inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output) -{ - const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out), - vqmovn_u32(out2))); - vst1_u8(output, u8results); -} - -inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output) -{ - vst1q_s16(output, out); - vst1q_s16(output + 8, out2); -} - -inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output) -{ - const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out), - vqmovun_s16(out2)); - vst1q_u8(output, u8results); -} - -inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output) -{ - const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out), - vqmovn_u16(out2)); - vst1q_u8(output, u8results); -} - -inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output) -{ - vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16))); - vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16))); -} - -inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2) -{ - // Convert to s16 and split in blocks of 4 values: - const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); - const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); - - const int16x4x3_t row = - { - { - vget_low_s16(s16_tmp0), - vget_high_s16(s16_tmp0), - vget_low_s16(s16_tmp1) - } - }; - - // Calculate row left value for pixels [0,3] - out = vmlal_s16(out, row.val[0], mat0); - // Calculate row middle value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); - // Calculate row right value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); - - // Calculate row left value for pixels [4,7] - out2 = vmlal_s16(out2, row.val[1], mat0); - // Calculate row middle value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); - // Calculate row right value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); -} - -inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) -{ - const int16x4_t mat0 = vld1_dup_s16(convolution); - const int16x4_t mat1 = vld1_dup_s16(convolution + 1); - const int16x4_t mat2 = vld1_dup_s16(convolution + 2); - - convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2); -} - -inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) -{ - const int16x4_t mat0 = vld1_dup_s16(convolution); - const int16x4_t mat1 = vld1_dup_s16(convolution + 1); - const int16x4_t mat2 = vld1_dup_s16(convolution + 2); - const int16x4_t mat3 = vld1_dup_s16(convolution + 3); - const int16x4_t mat4 = vld1_dup_s16(convolution + 4); - - // Convert to s16 and split in blocks of 4 values: - const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); - const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); - - const int16x4x3_t row = - { - { - vget_low_s16(s16_tmp0), - vget_high_s16(s16_tmp0), - vget_low_s16(s16_tmp1) - } - }; - - // Calculate row left 2 value for pixels [0,3] - out = vmlal_s16(out, row.val[0], mat0); - // Calculate row left 1 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); - // Calculate row middle value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); - // Calculate row right +1 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); - // Calculate row right +2 value for pixels [0,3] - out = vmlal_s16(out, row.val[1], mat4); - - // Calculate row left 2 value for pixels [4,7] - out2 = vmlal_s16(out2, row.val[1], mat0); - // Calculate row left 1 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); - // Calculate row middle value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); - // Calculate row right +1 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); - // Calculate row right +2 value for pixels [4,7] - out2 = vmlal_s16(out2, row.val[2], mat4); -} - -inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) -{ - const int16x4_t mat0 = vld1_dup_s16(convolution); - const int16x4_t mat1 = vld1_dup_s16(convolution + 1); - const int16x4_t mat2 = vld1_dup_s16(convolution + 2); - const int16x4_t mat3 = vld1_dup_s16(convolution + 3); - const int16x4_t mat4 = vld1_dup_s16(convolution + 4); - const int16x4_t mat5 = vld1_dup_s16(convolution + 5); - const int16x4_t mat6 = vld1_dup_s16(convolution + 6); - - // Convert to s16 and split in blocks of 4 values: - const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); - const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); - - const int16x4x4_t row = - { - { - vget_low_s16(s16_tmp0), - vget_high_s16(s16_tmp0), - vget_low_s16(s16_tmp1), - vget_high_s16(s16_tmp1) - } - }; - - // Calculate row left 3 value for pixels [0,3] - out = vmlal_s16(out, row.val[0], mat0); - // Calculate row left 2 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); - // Calculate row left 1 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); - // Calculate row middle value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); - // Calculate row right +1 value for pixels [0,3] - out = vmlal_s16(out, row.val[1], mat4); - // Calculate row right +2 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5); - // Calculate row right +3 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6); - - // Calculate row left 3 value for pixels [4,7] - out2 = vmlal_s16(out2, row.val[1], mat0); - // Calculate row left 2 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); - // Calculate row left 1 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); - // Calculate row middle value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); - // Calculate row right +1 value for pixels [4,7] - out2 = vmlal_s16(out2, row.val[2], mat4); - // Calculate row right +2 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5); - // Calculate row right +3 value for pixels [4,7] - out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6); -} - -inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) -{ - const int16x4_t mat0 = vld1_dup_s16(convolution); - const int16x4_t mat1 = vld1_dup_s16(convolution + 1); - const int16x4_t mat2 = vld1_dup_s16(convolution + 2); - const int16x4_t mat3 = vld1_dup_s16(convolution + 3); - const int16x4_t mat4 = vld1_dup_s16(convolution + 4); - const int16x4_t mat5 = vld1_dup_s16(convolution + 5); - const int16x4_t mat6 = vld1_dup_s16(convolution + 6); - const int16x4_t mat7 = vld1_dup_s16(convolution + 7); - const int16x4_t mat8 = vld1_dup_s16(convolution + 8); - - // Convert to s16 and split in blocks of 4 values: - const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); - const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); - - const int16x4x4_t row = - { - { - vget_low_s16(s16_tmp0), - vget_high_s16(s16_tmp0), - vget_low_s16(s16_tmp1), - vget_high_s16(s16_tmp1) - } - }; - - // Calculate row left 4 value for pixels [0,3] - out = vmlal_s16(out, row.val[0], mat0); - // Calculate row left 3 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); - // Calculate row left 2 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); - // Calculate row left 1 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); - // Calculate row middle value for pixels [0,3] - out = vmlal_s16(out, row.val[1], mat4); - // Calculate row right +1 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5); - // Calculate row right +2 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6); - // Calculate row right +3 value for pixels [0,3] - out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7); - // Calculate row right +4 value for pixels [0,3] - out = vmlal_s16(out, row.val[2], mat8); - - // Calculate row left 4 value for pixels [0,3] - out2 = vmlal_s16(out2, row.val[1], mat0); - // Calculate row left 3 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); - // Calculate row left 2 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); - // Calculate row left 1 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); - // Calculate row middle value for pixels [0,3] - out2 = vmlal_s16(out2, row.val[2], mat4); - // Calculate row right +1 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5); - // Calculate row right +2 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6); - // Calculate row right +3 value for pixels [0,3] - out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7); - // Calculate row right +4 value for pixels [0,3] - out2 = vmlal_s16(out2, row.val[3], mat8); -} -} // namespace - -/****************************************************************************************\ - * Square Convolution * -\****************************************************************************************/ - -template -NEConvolutionKernel::NEConvolutionKernel() - : INESimpleKernel(), _scale(0), _convolution{ {} } -{ -} - -template -BorderSize NEConvolutionKernel::border_size() const -{ - return BorderSize{ matrix_size / 2 }; -} - -template -void NEConvolutionKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv); - - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - - _input = input; - _output = output; - - std::copy_n(conv, _convolution.size(), _convolution.begin()); - - if(scale == 0) - { - _scale = calculate_matrix_scale(_convolution.data(), matrix_size); - } - else - { - _scale = scale; - } - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - INEKernel::configure(win); -} - -template <> -template -void NEConvolutionKernel<3>::convolution(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - - Iterator input(_input, win); - Iterator output(_output, win); - - // Load the matrix's coefficients into Neon registers: - const int16x4_t mat00 = vld1_dup_s16(_convolution.data()); - const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1); - const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2); - const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3); - const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4); - const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5); - const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6); - const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7); - const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8); - const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); - - const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1)); - const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0)); - const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1)); - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4_t out = vdupq_n_s32(0); - int32x4_t out2 = vdupq_n_s32(0); - - // Load 16 bytes from the top row: - const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); - convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02); - - // Load 16 bytes from the middle row: - const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); - convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12); - - // Load 16 bytes from the middle row: - const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset()); - convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22); - - // Apply scale - if(_scale != 1) - { - // Convert to F32, scale and convert back to S32 - out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); - out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); - } - - // Clamp and store as U8 or S16: - store_results(out, out2, reinterpret_cast(output.ptr())); - }, - input, output); -} - -template <> -template -void NEConvolutionKernel<5>::convolution(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - - Iterator input(_input, win); - Iterator output(_output, win); - - const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); - - const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2)); - const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1)); - const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0)); - const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1)); - const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2)); - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4_t out = vdupq_n_s32(0); - int32x4_t out2 = vdupq_n_s32(0); - - // Load 16 bytes from the top2 row: - const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); - convolve_row5x1(out, out2, data_t2, _convolution.data()); - - // Load 16 bytes from the top1 row: - const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); - convolve_row5x1(out, out2, data_t1, _convolution.data() + 5); - - // Load 16 bytes from the middle row: - const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); - convolve_row5x1(out, out2, data_m, _convolution.data() + 10); - - // Load 16 bytes from the low1 row: - const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); - convolve_row5x1(out, out2, data_b1, _convolution.data() + 15); - - // Load 16 bytes from the low2 row: - const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); - convolve_row5x1(out, out2, data_b2, _convolution.data() + 20); - - // Apply scale - if(_scale != 1) - { - // Convert to F32, scale and convert back to S32 - out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); - out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); - } - - // Clamp and store as U8 or S16: - store_results(out, out2, reinterpret_cast(output.ptr())); - }, - input, output); -} - -template <> -template -void NEConvolutionKernel<7>::convolution(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - - Iterator input(_input, win); - Iterator output(_output, win); - - const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); - - const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3)); - const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2)); - const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1)); - const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0)); - const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1)); - const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2)); - const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3)); - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4_t out = vdupq_n_s32(0); - int32x4_t out2 = vdupq_n_s32(0); - - // Load 16 bytes from the top3 row: - const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset()); - convolve_row7x1(out, out2, data_t3, _convolution.data()); - - // Load 16 bytes from the top2 row: - const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); - convolve_row7x1(out, out2, data_t2, _convolution.data() + 7); - - // Load 16 bytes from the top1 row: - const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); - convolve_row7x1(out, out2, data_t1, _convolution.data() + 14); - - // Load 16 bytes from the middle row: - const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); - convolve_row7x1(out, out2, data_m, _convolution.data() + 21); - - // Load 16 bytes from the low1 row: - const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); - convolve_row7x1(out, out2, data_b1, _convolution.data() + 28); - - // Load 16 bytes from the low2 row: - const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); - convolve_row7x1(out, out2, data_b2, _convolution.data() + 35); - - // Load 16 bytes from the low3 row: - const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset()); - convolve_row7x1(out, out2, data_b3, _convolution.data() + 42); - - // Apply scale - if(_scale != 1) - { - // Convert to F32, scale and convert back to S32 - out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); - out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); - } - - // Clamp and store as U8 or S16: - store_results(out, out2, reinterpret_cast(output.ptr())); - }, - input, output); -} - -template <> -template -void NEConvolutionKernel<9>::convolution(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - - Iterator input(_input, win); - Iterator output(_output, win); - - const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); - - const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4)); - const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3)); - const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2)); - const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1)); - const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0)); - const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1)); - const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2)); - const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3)); - const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4)); - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4_t out = vdupq_n_s32(0); - int32x4_t out2 = vdupq_n_s32(0); - - // Load 16 bytes from the top4 row: - const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset()); - convolve_row9x1(out, out2, data_t4, _convolution.data()); - - // Load 16 bytes from the top3 row: - const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset()); - convolve_row9x1(out, out2, data_t3, _convolution.data() + 9); - - // Load 16 bytes from the top2 row: - const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); - convolve_row9x1(out, out2, data_t2, _convolution.data() + 18); - - // Load 16 bytes from the top1 row: - const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); - convolve_row9x1(out, out2, data_t1, _convolution.data() + 27); - - // Load 16 bytes from the middle row: - const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); - convolve_row9x1(out, out2, data_m, _convolution.data() + 36); - - // Load 16 bytes from the low1 row: - const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); - convolve_row9x1(out, out2, data_b1, _convolution.data() + 45); - - // Load 16 bytes from the low2 row: - const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); - convolve_row9x1(out, out2, data_b2, _convolution.data() + 54); - - // Load 16 bytes from the low3 row: - const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset()); - convolve_row9x1(out, out2, data_b3, _convolution.data() + 63); - - // Load 16 bytes from the low4 row: - const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset()); - convolve_row9x1(out, out2, data_b4, _convolution.data() + 72); - - // Apply scale - if(_scale != 1) - { - // Convert to F32, scale and convert back to S32 - out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); - out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); - } - - // Clamp and store as U8 or S16: - store_results(out, out2, reinterpret_cast(output.ptr())); - }, - input, output); -} - -template -void NEConvolutionKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - switch(_output->info()->data_type()) - { - case DataType::U8: - convolution(window); - break; - case DataType::S16: - convolution(window); - break; - default: - ARM_COMPUTE_ERROR("Not supported Data type!"); - break; - } -} - -template class arm_compute::NEConvolutionKernel<3>; -template class arm_compute::NEConvolutionKernel<5>; -template class arm_compute::NEConvolutionKernel<7>; -template class arm_compute::NEConvolutionKernel<9>; - -/****************************************************************************************\ - * Separable Square Convolution * -\****************************************************************************************/ - -template -NESeparableConvolutionHorKernel::NESeparableConvolutionHorKernel() - : _conv_row{ { 0 } }, _border_size(0) -{ -} - -template -BorderSize NESeparableConvolutionHorKernel::border_size() const -{ - return _border_size; -} - -template -void NESeparableConvolutionHorKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row); - - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32); - - _input = input; - _output = output; - std::copy_n(conv_row, _conv_row.size(), _conv_row.begin()); - _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - INEKernel::configure(win); -} - -template -void NESeparableConvolutionHorKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_output->info()->data_type()) - { - case DataType::U16: - convolve(window); - break; - case DataType::S16: - convolve(window); - break; - case DataType::S32: - convolve(window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported intermediate data type!"); - break; - } -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<5>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -2); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const uint16x8x2_t data_u16 = - { - { - vmovl_u8(vget_low_u8(data)), - vmovl_u8(vget_high_u8(data)) - } - }; - - uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]); - - vst1q_u16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<5>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -2); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); - - vst1q_s16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -void NESeparableConvolutionHorKernel<5>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -2); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); - const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2); - const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3); - const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4); - - int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]); - - vst1q_s32(reinterpret_cast(output.ptr()), out_low); - - int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]); - - vst1q_s32(reinterpret_cast(output.ptr()) + 4, out_high); - }, - input, output); -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -3); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const uint16x8x2_t data_u16 = - { - { - vmovl_u8(vget_low_u8(data)), - vmovl_u8(vget_high_u8(data)) - } - }; - - uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]); - - vst1q_u16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -3); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]); - - vst1q_s16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -3); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); - const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2); - const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3); - const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4); - const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5); - const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6); - - int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]); - - vst1q_s32(reinterpret_cast(output.ptr()), out_low); - - int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]); - - vst1q_s32(reinterpret_cast(output.ptr()) + 4, out_high); - }, - input, output); -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -4); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const uint16x8x2_t data_u16 = - { - { - vmovl_u8(vget_low_u8(data)), - vmovl_u8(vget_high_u8(data)) - } - }; - - uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]); - out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]); - out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]); - - vst1q_u16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -inline void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -4); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]); - out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]); - out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]); - - vst1q_s16(reinterpret_cast(output.ptr()), out); - }, - input, output); -} - -template <> -template <> -void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) -{ - Window win_in(window); - win_in.shift(Window::DimX, -4); - - Iterator input(_input, win_in); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x16_t data = vld1q_u8(input.ptr()); - - const int16x8x2_t data_s16 = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) - } - }; - - const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); - const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2); - const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3); - const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4); - const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5); - const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6); - const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7); - - int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]); - out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]); - - vst1q_s32(reinterpret_cast(output.ptr()), out_low); - - int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]); - out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]); - - vst1q_s32(reinterpret_cast(output.ptr()) + 4, out_high); - }, - input, output); -} - -template class arm_compute::NESeparableConvolutionHorKernel<5>; -template class arm_compute::NESeparableConvolutionHorKernel<7>; -template class arm_compute::NESeparableConvolutionHorKernel<9>; - -template -NESeparableConvolutionVertKernel::NESeparableConvolutionVertKernel() - : _conv_col{ { 0 } }, _scale(0) -{ -} - -template -BorderSize NESeparableConvolutionVertKernel::border_size() const -{ - return BorderSize{ matrix_size / 2, 0 }; -} - -template -void NESeparableConvolutionVertKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col); - - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON(scale == 0); - - _input = input; - _output = output; - std::copy_n(conv_col, _conv_col.size(), _conv_col.begin()); - _scale = scale; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 16; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - INEKernel::configure(win); -} - -template -void NESeparableConvolutionVertKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - switch(_input->info()->data_type()) - { - case DataType::U16: - switch(_output->info()->data_type()) - { - case DataType::U8: - convolution_u16(window); - break; - case DataType::S16: - convolution_u16(window); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - break; - case DataType::S16: - switch(_output->info()->data_type()) - { - case DataType::U8: - convolution_s16(window); - break; - case DataType::S16: - convolution_s16(window); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - break; - case DataType::S32: - switch(_output->info()->data_type()) - { - case DataType::U8: - convolution_s32(window); - break; - case DataType::S16: - convolution_s32(window); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported intermediate data type!"); - break; - } -} - -template -template -void NESeparableConvolutionVertKernel::convolution_u16(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - - Window win_in(win); - win_in.set_dimension_step(Window::DimX, 8); - - Iterator in(_input, win_in); - Iterator out(_output, win); - - std::array input_ptrs{ {} }; - const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale); - const int k_half = matrix_size / 2; - - // Set row pointers - for(int i = -k_half; i <= k_half; ++i) - { - input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i)); - } - - execute_window_loop(win, [&](const Coordinates &) - { - uint16x8_t out0 = vdupq_n_u16(0); - uint16x8_t out1 = vdupq_n_u16(0); - - // First half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const uint16x8_t data = vld1q_u16(reinterpret_cast(input_ptrs[r] + in.offset())); - out0 = vmlaq_n_u16(out0, data, _conv_col[r]); - } - - in.increment(Window::DimX); - - // Second half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const uint16x8_t data = vld1q_u16(reinterpret_cast(input_ptrs[r] + in.offset())); - out1 = vmlaq_n_u16(out1, data, _conv_col[r]); - } - - //scale the result if needed - if(_scale != 1) - { - float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0))); - float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0))); - out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale); - out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale); - store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast(out.ptr())); - - float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1))); - float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1))); - out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale); - out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale); - store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast(out.ptr()) + 8); - } - else - { - store_results(out0, out1, reinterpret_cast(out.ptr())); - } - }, - in, out); -} - -template -template -void NESeparableConvolutionVertKernel::convolution_s16(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - - Window win_in(win); - win_in.set_dimension_step(Window::DimX, 8); - - Iterator in(_input, win_in); - Iterator out(_output, win); - - std::array input_ptrs{ {} }; - const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale); - const int k_half = matrix_size / 2; - - // Set row pointers - for(int i = -k_half; i <= k_half; ++i) - { - input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i)); - } - - execute_window_loop(win, [&](const Coordinates &) - { - int16x8_t out0 = vdupq_n_s16(0); - int16x8_t out1 = vdupq_n_s16(0); - - // First half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const int16x8_t data = vld1q_s16(reinterpret_cast(input_ptrs[r] + in.offset())); - out0 = vmlaq_n_s16(out0, data, _conv_col[r]); - } - - in.increment(Window::DimX); - - // Second half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const int16x8_t data = vld1q_s16(reinterpret_cast(input_ptrs[r] + in.offset())); - out1 = vmlaq_n_s16(out1, data, _conv_col[r]); - } - - //scale the result if needed - if(_scale != 1) - { - float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0))); - float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0))); - out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale); - out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale); - store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast(out.ptr())); - - float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1))); - float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1))); - out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale); - out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale); - store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast(out.ptr()) + 8); - } - else - { - store_results(out0, out1, reinterpret_cast(out.ptr())); - } - }, - in, out); -} - -template -template -void NESeparableConvolutionVertKernel::convolution_s32(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - - Window win_in(win); - win_in.set_dimension_step(Window::DimX, 8); - - Iterator in(_input, win_in); - Iterator out(_output, win); - - std::array input_ptrs{ {} }; - const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale); - const int k_half = matrix_size / 2; - - // Set row pointers - for(int i = -k_half; i <= k_half; ++i) - { - input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i)); - } - - const int32x4_t zero = vdupq_n_s32(0); - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4x2_t out0 = - { - { - zero, - zero - } - }; - - int32x4x2_t out1 = - { - { - zero, - zero - } - }; - - // First half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const int32x4x2_t data = vld2q_s32(reinterpret_cast(input_ptrs[r] + in.offset())); - out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]); - out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]); - } - - in.increment(Window::DimX); - - // Second half - for(unsigned int r = 0; r < matrix_size; ++r) - { - const int32x4x2_t data = vld2q_s32(reinterpret_cast(input_ptrs[r] + in.offset())); - out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]); - out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]); - } - - //scale the result if needed - if(_scale != 1) - { - float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]); - float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]); - out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale); - out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale); - out0.val[0] = vcvtq_s32_f32(out0_f32_odd); - out0.val[1] = vcvtq_s32_f32(out0_f32_even); - - float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]); - float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]); - out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale); - out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale); - out1.val[0] = vcvtq_s32_f32(out1_f32_odd); - out1.val[1] = vcvtq_s32_f32(out1_f32_even); - } - - const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]); - store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast(out.ptr())); - - const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]); - store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast(out.ptr()) + 8); - }, - in, out); -} - -template class arm_compute::NESeparableConvolutionVertKernel<5>; -template class arm_compute::NESeparableConvolutionVertKernel<7>; -template class arm_compute::NESeparableConvolutionVertKernel<9>; - -/****************************************************************************************\ - * Rectangle Convolution * -\****************************************************************************************/ - -NEConvolutionRectangleKernel::NEConvolutionRectangleKernel() - : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0) -{ -} - -BorderSize NEConvolutionRectangleKernel::border_size() const -{ - return _border_size; -} - -void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv); - - set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9); - ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9); - ARM_COMPUTE_ERROR_ON(0 == scale); - - _input = input; - _output = output; - _scale = scale; - _border_size = BorderSize(height / 2, width / 2); - - // Setup the convolution matrix - const uint32_t nr_elements = width * height; - _convolution.resize(nr_elements); - std::copy_n(conv, nr_elements, _convolution.begin()); - - // Set function index to help choose appropriate function in run() - _func_idx = get_index(height) * 4 + get_index(width); - ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes)); - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 8; - constexpr unsigned int num_elems_read_per_iteration = 16; - constexpr unsigned int num_elems_written_per_iteration = 8; - - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size); - - INEKernel::configure(win); -} - -void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window); - - // uint8_t function table - static const std::array func_table_u8 = - { - { - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution - } - }; - // int16_t function table - static const std::array func_table_s16 = - { - { - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution, - &NEConvolutionRectangleKernel::convolution - } - }; - - // Run appropriate function - switch(_output->info()->data_type()) - { - case DataType::U8: - ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size()); - (this->*func_table_u8[_func_idx])(window); - break; - case DataType::S16: - ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size()); - (this->*func_table_s16[_func_idx])(window); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } -} - -unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val) -{ - switch(val) - { - case 3: - return 0; - case 5: - return 1; - case 7: - return 2; - case 9: - return 3; - default: - ARM_COMPUTE_ERROR("Not supported dimension size"); - return 0; - } -} - -template -void NEConvolutionRectangleKernel::convolution(const Window &win) -{ - static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - - Iterator input(_input, win); - Iterator output(_output, win); - - std::array input_ptrs{ {} }; - const int16_t *conv = _convolution.data(); - const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); - const int k_row_half = rows / 2; - const int k_col_half = cols / 2; - - // Set row pointers - for(int i = -k_row_half; i <= k_row_half; ++i) - { - input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i)); - } - - execute_window_loop(win, [&](const Coordinates &) - { - int32x4_t out = vdupq_n_s32(0); - int32x4_t out2 = vdupq_n_s32(0); - - // Perform appropriate convolution - for(unsigned int r = 0; r < rows; ++r) - { - const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset()); - if(3 == cols) - { - convolve_row3x1(out, out2, data, conv + r * cols); - } - else if(5 == cols) - { - convolve_row5x1(out, out2, data, conv + r * cols); - } - else if(7 == cols) - { - convolve_row7x1(out, out2, data, conv + r * cols); - } - else if(9 == cols) - { - convolve_row9x1(out, out2, data, conv + r * cols); - } - else - { - ARM_COMPUTE_ERROR("Unsupported number of columns"); - } - } - - // Apply scale - if(_scale != 1) - { - // Convert to F32, scale and convert back to S32 - out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); - out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); - } - - // Clamp and store as U8 or S16: - store_results(out, out2, reinterpret_cast(output.ptr())); - }, - input, output); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEConvolutionKernel.h b/src/core/NEON/kernels/NEConvolutionKernel.h deleted file mode 100644 index b8bf1d169e..0000000000 --- a/src/core/NEON/kernels/NEConvolutionKernel.h +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H -#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H - -#include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/INESimpleKernel.h" - -#include -#include -#include - -namespace arm_compute -{ -class ITensor; - -/****************************************************************************************\ - * Square Convolution * -\****************************************************************************************/ - -/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9). - * The client can supply a convolution matrix \f$ C_{m,n} \f$. - * @f{eqnarray}{ - * k_0 &=& \frac{m}{2} \\ - * l_0 &=& \frac{n}{2} \\ - * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l} - * @f} - * - * @note The above equation for this function is similar to the default OpenCV Filter2D function, - * which actually computes a correlation and not a convolution. - * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically. - */ -template -class NEConvolutionKernel : public INESimpleKernel -{ -public: - const char *name() const override - { - return "NEConvolutionKernel"; - } - /** Default constructor */ - NEConvolutionKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NEConvolutionKernel(const NEConvolutionKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NEConvolutionKernel &operator=(const NEConvolutionKernel &) = delete; - /** Allow instances of this class to be moved */ - NEConvolutionKernel(NEConvolutionKernel &&) = default; - /** Allow instances of this class to be moved */ - NEConvolutionKernel &operator=(NEConvolutionKernel &&) = default; - /** Default destructor */ - ~NEConvolutionKernel() = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data type supported: U8. - * @param[out] output Destination tensor. Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - template - void convolution(const Window &win); - -protected: - uint32_t _scale; /**< scale of the convolution */ - std::array _convolution; /**< convolution matrix */ -}; - -/** Interface for the kernel which applied a 3x3 convolution to a tensor.*/ -using NEConvolution3x3Kernel = NEConvolutionKernel<3>; -/** Interface for the kernel which applied a 5x5 convolution to a tensor.*/ -using NEConvolution5x5Kernel = NEConvolutionKernel<5>; -/** Interface for the kernel which applied a 7x7 convolution to a tensor.*/ -using NEConvolution7x7Kernel = NEConvolutionKernel<7>; -///** Interface for the kernel which applied a 9x9 convolution to a tensor.*/ -using NEConvolution9x9Kernel = NEConvolutionKernel<9>; - -/****************************************************************************************\ - * Separable Square Convolution * -\****************************************************************************************/ - -/** Kernel for the Horizontal pass of a Separable Convolution */ -template -class NESeparableConvolutionHorKernel : public INESimpleKernel -{ -public: - const char *name() const override - { - return "NESeparableConvolutionHorKernel"; - } - /** Default constructor */ - NESeparableConvolutionHorKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NESeparableConvolutionHorKernel(const NESeparableConvolutionHorKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NESeparableConvolutionHorKernel &operator=(const NESeparableConvolutionHorKernel &) = delete; - /** Allow instances of this class to be moved */ - NESeparableConvolutionHorKernel(NESeparableConvolutionHorKernel &&) = default; - /** Allow instances of this class to be moved */ - NESeparableConvolutionHorKernel &operator=(NESeparableConvolutionHorKernel &&) = default; - /** Default destructor */ - ~NESeparableConvolutionHorKernel() = default; - - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data type supported: U8. - * @param[out] output Destination tensor. Data types supported: U16, S16, S32. - * @param[in] conv_row Convolution matrix to apply to the input tensor. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - /** Apply the object's convolution to the given window of the input tensor.. - * - * @param[in] window Window to apply the convolution on. - */ - template - void convolve(const Window &window); - - std::array _conv_row; /**< Convolution coefficients */ - BorderSize _border_size; /**< Border size */ -}; - -/** Interface for the kernel which applied a 5x1 horizontal convolution to a tensor.*/ -using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>; -/** Interface for the kernel which applied a 7x1 horizontal convolution to a tensor.*/ -using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>; -/** Interface for the kernel which applied a 9x1 horizontal convolution to a tensor.*/ -using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>; - -/** Kernel for the Vertical pass of a Separable Convolution */ -template -class NESeparableConvolutionVertKernel : public INESimpleKernel -{ -public: - const char *name() const override - { - return "NESeparableConvolutionVertKernel"; - } - /** Default constructor */ - NESeparableConvolutionVertKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NESeparableConvolutionVertKernel(const NESeparableConvolutionVertKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers). */ - NESeparableConvolutionVertKernel &operator=(const NESeparableConvolutionVertKernel &) = delete; - /** Allow instances of this class to be moved */ - NESeparableConvolutionVertKernel(NESeparableConvolutionVertKernel &&) = default; - /** Allow instances of this class to be moved */ - NESeparableConvolutionVertKernel &operator=(NESeparableConvolutionVertKernel &&) = default; - /** Default destructor */ - ~NESeparableConvolutionVertKernel() = default; - - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data type supported: U16, S16, S32. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv_col Convolution matrix to apply to the input tensor. - * @param[in] scale Scale of the convolution matrix - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - /** Apply the object's convolution to the given window of the input tensor. - * This function is used if the intermediate values have been stored as U16. - * - * @param[in] win Window to apply the convolution on. - */ - template - void convolution_u16(const Window &win); - /** Apply the object's convolution to the given window of the input tensor. - * This function is used if the intermediate values have been stored as S16. - * - * @param[in] win Window to apply the convolution on. - */ - template - void convolution_s16(const Window &win); - /** Apply the object's convolution to the given window of the input tensor. - * This function is used if the intermediate values have been stored as S32. - * - * @param[in] win Window to apply the convolution on. - */ - template - void convolution_s32(const Window &win); - - std::array _conv_col; /**< Convolution coefficients */ - uint32_t _scale; /**< Convolution's scale */ -}; - -/** Interface for the kernel which applied a 1x5 vertical convolution to a tensor.*/ -using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>; -/** Interface for the kernel which applied a 1x7 vertical convolution to a tensor.*/ -using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>; -/** Interface for the kernel which applied a 1x9 vertical convolution to a tensor.*/ -using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>; - -/****************************************************************************************\ - * Rectangle Convolution * -\****************************************************************************************/ - -/** Kernel for the running convolution on a rectangle matrix. - * - * @note Supports combinations of 3,5,7 and 9. - */ -class NEConvolutionRectangleKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEConvolutionRectangleKernel"; - } - /** Default constructor */ - NEConvolutionRectangleKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete; - /** Allow instances of this class to be moved */ - NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default; - /** Allow instances of this class to be moved */ - NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default; - /** Default destructor */ - ~NEConvolutionRectangleKernel() = default; - /** Initialise the kernel's input, output and border mode. - * - * @param[in] input Source tensor. Data type supported: U8. - * @param[out] output Destination tensor, Data types supported: U8, S16. - * @param[in] conv Convolution matrix to apply to the input tensor. - * @param[in] width Width of convolution matrix (Number of columns) - * @param[in] height Height of convolution matrix (Number of rows) - * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - unsigned int get_index(uint32_t val); - /** Apply the object's convolution to the given window of the input tensor. - * - * @param[in] win Window to apply the convolution on. - */ - template - void convolution(const Window &win); - -protected: - const ITensor *_input; /**< Input tensor */ - ITensor *_output; /**< Output tensor */ - uint32_t _scale; /**< Scale of the convolution */ - std::vector _convolution; /**< Convolution matrix */ - BorderSize _border_size; /**< Calculated border width */ - uint32_t _func_idx; /**< Index used to specify convolution function to be used */ - const static unsigned int _nr_supported_sizes - { - 4 - }; /**< Number of supported permutations */ -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */ diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp deleted file mode 100644 index 9f5dfcdcdb..0000000000 --- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include - -using namespace arm_compute; - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -namespace fp16 -{ -inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) -{ - // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] - mask = vandq_u16(mask, vcgeq_f16(vc, in0)); - mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1))); - mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2))); -} - -inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) -{ - // vc >= nc.val[0], vc > nc.val[2] - mask = vandq_u16(mask, vcgeq_f16(vc, in0)); - mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2))); -} - -inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) -{ - // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] - mask = vandq_u16(mask, vcgtq_f16(vc, in0)); - mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1))); - mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2))); -} - -inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride) -{ - auto in = static_cast(in_ptr) - 1; - const auto out = static_cast(out_ptr); - - // Get centre scores - const float16x8x2_t vc = - { - vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))), - vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13))) - }; - - // Neighboring pixels - in -= in_stride; - - static const float16x4_t zero_f16x4 = vdup_n_f16(0); - static const uint16x8_t zero_u16 = vdupq_n_u16(0); - static const uint16x8_t true_mask = vceqq_u16(zero_u16, zero_u16); - static const uint16x8x2_t true_mask_x2 = - { - true_mask, - true_mask - }; - - uint16x8x2_t mask = true_mask_x2; - - // Top row - const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); - const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); - const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); - - // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2] - mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]); - mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]); - - in += in_stride; - - // Middle row - const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); - const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); - const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); - - // vc >= nc.val[0], vc > nc.val[2] - mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]); - mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]); - - in += in_stride; - - // Bottom row - const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); - const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); - const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); - - // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] - mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]); - mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]); - - // Store - static const float16x8_t zero_f16x8 = vdupq_n_f16(0); - - const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8); - vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0))); - vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0))); - - const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8); - vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1))); - vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1))); -} - -inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride) -{ - auto in = static_cast(in_ptr) - 1; - const auto out = static_cast(out_ptr); - - // Get centre scores - const uint8x16_t vc = vld1q_u8(in + 1); - - // Neighboring pixels - in -= in_stride; - - // Top row - const uint8x16_t l_nc_0 = vld1q_u8(in); - const uint8x16_t m_nc_0 = vld1q_u8(in + 1); - const uint8x16_t r_nc_0 = vld1q_u8(in + 2); - - // Keep center scores if ... - // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0 - uint8x16_t mask = vcgeq_u8(vc, l_nc_0); - mask = vandq_u8(mask, vcgeq_u8(vc, m_nc_0)); - mask = vandq_u8(mask, vcgeq_u8(vc, r_nc_0)); - - in += in_stride; - - // Middle row - const uint8x16_t l_nc_1 = vld1q_u8(in); - const uint8x16_t r_nc_1 = vld1q_u8(in + 2); - - // ... and ... - // vc >= l_nc_1, vc > r_nc_1 - mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1)); - mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1)); - - in += in_stride; - - // Bottom row - const uint8x16_t l_nc_2 = vld1q_u8(in); - const uint8x16_t m_nc_2 = vld1q_u8(in + 1); - const uint8x16_t r_nc_2 = vld1q_u8(in + 2); - - // ... and ... - // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2 - mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2)); - mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2)); - mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2)); - - // Store - static const uint8x16_t zero = vdupq_n_u8(0); - vst1q_u8(out, vbslq_u8(mask, vc, zero)); -} -} // namespace fp16 - -void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - _input = input; - _output = output; - - switch(input->info()->data_type()) - { - case DataType::U8: - _func = &fp16::non_maxima_suppression3x3_U8_U8; - break; - default: - _func = &fp16::non_maxima_suppression3x3_F32_F32; - break; - } - - constexpr unsigned int num_elems_processed_per_iteration = 16; - const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3); - constexpr unsigned int num_elems_written_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = 3; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - INEKernel::configure(win); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -namespace -{ -inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride) -{ - auto input = static_cast(input_ptr) - 1; - const auto output = static_cast(output_ptr); - - // Get centre scores - const float32x4x4_t vc = - { - { - vld1q_f32(input + 1), - vld1q_f32(input + 5), - vld1q_f32(input + 9), - vld1q_f32(input + 13) - } - }; - - // Neighboring pixels - float32x4x4_t l_nc{ {} }; - float32x4x4_t m_nc{ {} }; - float32x4x4_t r_nc{ {} }; - - input -= input_stride; - - // Row0 - Low part - float32x4_t tmp_low = vld1q_f32(input); - float32x4_t tmp_high = vld1q_f32(input + 4); - float32x4_t tmp_high1 = vld1q_f32(input + 8); - - l_nc.val[0] = tmp_low; - m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[1] = tmp_low; - m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); - - // Row0 - High part - tmp_low = tmp_high1; - tmp_high = vld1q_f32(input + 12); - tmp_high1 = vld1q_f32(input + 16); - - l_nc.val[2] = tmp_low; - m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[3] = tmp_low; - m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); - - // mc >= nc.val[0], mc >= nc.val[1], mc >= nc.val[2] - uint32x4x4_t mask{ {} }; - mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]); - mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0])); - mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0])); - mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]); - mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1])); - mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1])); - mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]); - mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2])); - mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2])); - mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]); - mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3])); - mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3])); - - input += input_stride; - - // Row1 - Low part - tmp_low = vld1q_f32(input); - tmp_high = vld1q_f32(input + 4); - tmp_high1 = vld1q_f32(input + 8); - - l_nc.val[0] = tmp_low; - r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[1] = tmp_low; - r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); - - // Row1 - High part - tmp_low = tmp_high1; - tmp_high = vld1q_f32(input + 12); - tmp_high1 = vld1q_f32(input + 16); - - l_nc.val[2] = tmp_low; - r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[3] = tmp_low; - r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); - - // mc >= nc.val[0], mc > nc.val[2] - mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0])); - mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0])); - mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1])); - mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1])); - mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2])); - mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2])); - mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3])); - mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3])); - - input += input_stride; - - // Row2 - Low part - tmp_low = vld1q_f32(input); - tmp_high = vld1q_f32(input + 4); - tmp_high1 = vld1q_f32(input + 8); - - l_nc.val[0] = tmp_low; - m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[1] = tmp_low; - m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); - - // Row2 - High part - tmp_low = tmp_high1; - tmp_high = vld1q_f32(input + 12); - tmp_high1 = vld1q_f32(input + 16); - - l_nc.val[2] = tmp_low; - m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2); - - tmp_low = tmp_high; - tmp_high = tmp_high1; - - l_nc.val[3] = tmp_low; - m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1); - r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); - - // mc > nc.val[0], mc > nc.val[1], mc > nc.val[2] - mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0])); - mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0])); - mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0])); - mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1])); - mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1])); - mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1])); - mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2])); - mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2])); - mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2])); - mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3])); - mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3])); - mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3])); - - static const float32x4_t zero = vdupq_n_f32(0.f); - - // Store - vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero)); - vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero)); - vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero)); - vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero)); -} - -inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride) -{ - auto input = static_cast(input_ptr) - 1; - const auto output = static_cast(output_ptr); - - // Get centre scores - const uint8x16_t vc = vld1q_u8(input + 1); - - // Neighboring pixels - uint8x16_t l_nc{}; - uint8x16_t m_nc{}; - uint8x16_t r_nc{}; - - input -= input_stride; - - // Row0 - l_nc = vld1q_u8(input); - m_nc = vld1q_u8(input + 1); - r_nc = vld1q_u8(input + 2); - - // mc >= l_nc, mc >= m_nc, mc >= r_nc - uint8x16_t mask = vcgeq_u8(vc, l_nc); - mask = vandq_u8(mask, vcgeq_u8(vc, m_nc)); - mask = vandq_u8(mask, vcgeq_u8(vc, r_nc)); - - input += input_stride; - - // Row1 - l_nc = vld1q_u8(input); - r_nc = vld1q_u8(input + 2); - - // mc >= l_nc, mc > r_nc - mask = vandq_u8(mask, vcgeq_u8(vc, l_nc)); - mask = vandq_u8(mask, vcgtq_u8(vc, r_nc)); - - input += input_stride; - - // Row2 - l_nc = vld1q_u8(input); - m_nc = vld1q_u8(input + 1); - r_nc = vld1q_u8(input + 2); - - // mc > l_nc, mc > m_nc, mc > r_nc - mask = vandq_u8(mask, vcgtq_u8(vc, l_nc)); - mask = vandq_u8(mask, vcgtq_u8(vc, m_nc)); - mask = vandq_u8(mask, vcgtq_u8(vc, r_nc)); - - static const uint8x16_t zero = vdupq_n_u8(0); - - // Store - vst1q_u8(output, vbslq_u8(mask, vc, zero)); -} -} // namespace - -NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel() - : _func(nullptr), _input(nullptr), _output(nullptr) -{ -} - -BorderSize NENonMaximaSuppression3x3Kernel::border_size() const -{ - return BorderSize(1); -} - -void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - _input = input; - _output = output; - - if(input->info()->data_type() == DataType::U8) - { - _func = &non_maxima_suppression3x3_U8_U8; - } - else - { - _func = &non_maxima_suppression3x3_FLOAT_FLOAT; - } - - constexpr unsigned int num_elems_processed_per_iteration = 16; - const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3); - constexpr unsigned int num_elems_written_per_iteration = 16; - constexpr unsigned int num_rows_read_per_iteration = 3; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), - output_access); - - output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); - - INEKernel::configure(win); -} - -void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - Iterator input(_input, window); - Iterator output(_output, window); - - const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type()); - - execute_window_loop(window, [&](const Coordinates &) - { - _func(input.ptr(), output.ptr(), input_stride); - }, - input, output); -} diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h deleted file mode 100644 index 4194dac68e..0000000000 --- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H -#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H - -#include "src/core/NEON/INEKernel.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Interface to perform Non-Maxima suppression over a 3x3 window using Neon - * - */ -class NENonMaximaSuppression3x3Kernel : public INEKernel -{ -public: - const char *name() const override - { - return "NENonMaximaSuppression3x3Kernel"; - } - /** Default constructor */ - NENonMaximaSuppression3x3Kernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete; - /** Allow instances of this class to be moved */ - NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default; - /** Allow instances of this class to be moved */ - NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default; - /** Default destructor */ - ~NENonMaximaSuppression3x3Kernel() = default; - - /** Initialise the kernel's sources, destinations and border mode. - * - * @param[in] input Source tensor. Data types supported: U8/F32 - * @param[out] output Destination tensor. Data types supported: same as @p input - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, bool border_undefined); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -protected: - /** Common signature for all the specialised non-maxima suppression 3x3 functions - * - * @param[in] input_ptr Pointer to the input tensor. - * @param[out] output_ptr Pointer to the output tensor - * @param[in] input_stride Stride of the input tensor - */ - using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride); - - NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */ - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ -}; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 - */ -class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel -{ -public: - const char *name() const override - { - return "NENonMaximaSuppression3x3FP16Kernel"; - } - /** Initialise the kernel's sources, destinations and border mode. - * - * @param[in] input Source tensor. Data types supported: U8/F32. - * @param[out] output Destination tensor. Data types supported: same as @p input - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. - */ - void configure(const ITensor *input, ITensor *output, bool border_undefined); -}; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -/** Neon kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */ -using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -} // namespace arm_compute -#endif /* _ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */ diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp new file mode 100644 index 0000000000..24d0dd82e8 --- /dev/null +++ b/src/core/NEON/kernels/NERemapKernel.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/NEON/kernels/NERemapKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/WindowHelpers.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride) +{ + const float32x4_t lowerxy = vdupq_n_f32(-1.f); + + float32x4_t x = vld1q_f32(mapx_ptr); + float32x4_t y = vld1q_f32(mapy_ptr); + + // Clamp x coordinates + x = vmaxq_f32(lowerxy, vminq_f32(x, width)); + y = vmaxq_f32(lowerxy, vminq_f32(y, height)); + + const int32x4_t x_s32 = vcvtq_s32_f32(x); + const int32x4_t y_s32 = vcvtq_s32_f32(y); + + return vmlaq_s32(x_s32, y_s32, stride); +} + +} // namespace + +NERemapKernel::NERemapKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr) +{ +} + +BorderSize NERemapKernel::border_size() const +{ + return BorderSize(1); +} + +void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + + _input = input; + _output = output; + _map_x = map_x; + _map_y = map_y; + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + _func = &NERemapKernel::remap_nearest; + break; + } + case InterpolationPolicy::BILINEAR: + { + _func = &NERemapKernel::remap_bilinear; + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + const int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration); + const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0); + + AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom); + + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NERemapKernel::remap_nearest(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator mapx(_map_x, window); + Iterator mapy(_map_y, window); + + const float32x4_t width = vdupq_n_f32(static_cast(_input->info()->dimension(0))); + const float32x4_t height = vdupq_n_f32(static_cast(_input->info()->dimension(1))); + const int32x4_t in_stride = vdupq_n_s32(static_cast(_input->info()->strides_in_bytes()[1])); + + execute_window_loop(window, [&](const Coordinates &) + { + const auto mapx_ptr = reinterpret_cast(mapx.ptr()); + const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + const uint8_t *in_ptr = in.ptr(); + + const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride); + const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride); + const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride); + const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride); + + uint8x16_t tmp = vdupq_n_u8(0); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14); + tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15); + vst1q_u8(out.ptr(), tmp); + }, + in, out, mapx, mapy); +} + +void NERemapKernel::remap_bilinear(const Window &window) +{ + using namespace scale_helpers; + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator mapx(_map_x, window); + Iterator mapy(_map_y, window); + + const size_t width = _input->info()->dimension(0); + const size_t height = _input->info()->dimension(1); + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates &) + { + const auto mapx_ptr = reinterpret_cast(mapx.ptr()); + const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + const uint8_t *in_ptr = in.ptr(); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, out, mapx, mapy); +} + +void NERemapKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h new file mode 100644 index 0000000000..adc7f4bdd5 --- /dev/null +++ b/src/core/NEON/kernels/NERemapKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEREMAPKERNEL_H +#define ARM_COMPUTE_NEREMAPKERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Neon kernel to perform a remap on a tensor */ +class NERemapKernel : public INEKernel +{ +public: + const char *name() const override + { + return "NERemapKernel"; + } + /** Default constructor */ + NERemapKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel(const NERemapKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel &operator=(const NERemapKernel &) = delete; + /** Allow instances of this class to be moved */ + NERemapKernel(NERemapKernel &&) = default; + /** Allow instances of this class to be moved */ + NERemapKernel &operator=(NERemapKernel &&) = default; + /** Default destructor */ + ~NERemapKernel() = default; + + /** Initialize the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] map_x Map for X coordinates. Data type supported: F32. + * @param[in] map_y Map for Y coordinates. Data type supported: F32. + * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. + * @param[in] policy The interpolation type. + */ + void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + /** function to perform nearest interpolation on the given window */ + void remap_nearest(const Window &window); + /** function to perform bilinear interpolation on the given window */ + void remap_bilinear(const Window &window); + /** Remap function to use for the particular interpolation type passed to configure() */ + void (NERemapKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input image */ + ITensor *_output; /**< Output image */ + const ITensor *_map_x; /**< Input remap x coordinates */ + const ITensor *_map_y; /**< Input remap y coordinates */ +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */ diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp deleted file mode 100644 index ff5b0a864d..0000000000 --- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" - -#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" - -#include - -using namespace arm_compute; - -void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); -} - -void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp deleted file mode 100644 index 44020fd816..0000000000 --- a/src/runtime/CL/functions/CLAccumulate.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLAccumulate.h" - -#include "src/core/CL/kernels/CLAccumulateKernel.h" - -#include - -using namespace arm_compute; - -void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, accum); -} - -void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, accum); - _kernel = std::move(k); -} - -void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum); -} - -void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, alpha, accum); - _kernel = std::move(k); -} - -void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum); -} - -void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, shift, accum); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp deleted file mode 100644 index 09e24d1bc0..0000000000 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLBox3x3.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLBox3x3Kernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp deleted file mode 100644 index 7e99a1bbb3..0000000000 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLCannyEdge.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "src/core/CL/kernels/CLCannyEdgeKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLSobel5x5Kernel.h" -#include "src/core/CL/kernels/CLSobel7x7Kernel.h" - -using namespace arm_compute; - -CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _gradient(std::make_unique()), - _border_mag_gradient(std::make_unique()), - _non_max_suppr(std::make_unique()), - _edge_trace(std::make_unique()), - _gx(), - _gy(), - _mag(), - _phase(), - _nonmax(), - _visited(), - _recorded(), - _l1_list_counter(), - _l1_stack(), - _output(nullptr) -{ -} - -CLCannyEdge::~CLCannyEdge() = default; - -void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value); -} - -void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, - BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); - ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); - ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr)); - - _output = output; - - const unsigned int L1_hysteresis_stack_size = 8; - const TensorShape shape = input->info()->tensor_shape(); - - TensorInfo gradient_info; - TensorInfo info; - - // Initialize images - if(gradient_size < 7) - { - gradient_info.init(shape, 1, arm_compute::DataType::S16); - info.init(shape, 1, arm_compute::DataType::U16); - } - else - { - gradient_info.init(shape, 1, arm_compute::DataType::S32); - info.init(shape, 1, arm_compute::DataType::U32); - } - - _gx.allocator()->init(gradient_info); - _gy.allocator()->init(gradient_info); - _mag.allocator()->init(info); - _nonmax.allocator()->init(info); - - TensorInfo info_u8(shape, 1, arm_compute::DataType::U8); - _phase.allocator()->init(info_u8); - _l1_list_counter.allocator()->init(info_u8); - - TensorInfo info_u32(shape, 1, arm_compute::DataType::U32); - _visited.allocator()->init(info_u32); - _recorded.allocator()->init(info_u32); - - TensorShape shape_l1_stack = input->info()->tensor_shape(); - shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size); - TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32); - _l1_stack.allocator()->init(info_s32); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Configure/Init sobelNxN - if(gradient_size == 3) - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 5) - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 7) - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else - { - ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size); - } - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Configure gradient - _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Configure non-maxima suppression - _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); - - // Allocate intermediate buffers - _phase.allocator()->allocate(); - - // Fill border around magnitude image as non-maxima suppression will access - // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value); - - // Allocate intermediate buffers - _mag.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_visited); - _memory_group.manage(&_recorded); - _memory_group.manage(&_l1_stack); - _memory_group.manage(&_l1_list_counter); - - // Configure edge tracing - _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); - - // Allocate intermediate buffers - _visited.allocator()->allocate(); - _recorded.allocator()->allocate(); - _l1_stack.allocator()->allocate(); - _l1_list_counter.allocator()->allocate(); - _nonmax.allocator()->allocate(); -} - -void CLCannyEdge::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run sobel - _sobel->run(); - - // Run phase and magnitude calculation - CLScheduler::get().enqueue(*_gradient, false); - - // Fill border before non-maxima suppression. Nop for border mode undefined. - CLScheduler::get().enqueue(*_border_mag_gradient, false); - - // Run non max suppresion - _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(*_non_max_suppr, false); - - // Clear temporary structures and run edge trace - _output->clear(CLScheduler::get().queue()); - _visited.clear(CLScheduler::get().queue()); - _recorded.clear(CLScheduler::get().queue()); - _l1_list_counter.clear(CLScheduler::get().queue()); - _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(*_edge_trace, true); -} diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp deleted file mode 100644 index 543de9c653..0000000000 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLChannelCombine.h" - -#include "src/core/CL/kernels/CLChannelCombineKernel.h" - -#include - -using namespace arm_compute; - -void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, plane0, plane1, plane2, plane3, output); - _kernel = std::move(k); -} - -void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, plane0, plane1, plane2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp deleted file mode 100644 index 645fc051cb..0000000000 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLChannelExtract.h" - -#include "src/core/CL/kernels/CLChannelExtractKernel.h" - -#include - -using namespace arm_compute; - -void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} - -void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp deleted file mode 100644 index 9aeeb65dc4..0000000000 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLColorConvert.h" - -#include "src/core/CL/kernels/CLColorConvertKernel.h" - -#include - -using namespace arm_compute; - -void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp deleted file mode 100644 index ffc7cda034..0000000000 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLConvolution.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/CL/kernels/CLConvolutionKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} - -template -CLConvolutionSquare::CLConvolutionSquare(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(std::make_unique>()), - _kernel_vert(std::make_unique>()), _kernel(std::make_unique>()), _border_handler(std::make_unique()) -{ -} - -template -CLConvolutionSquare::~CLConvolutionSquare() = default; - -template -void CLConvolutionSquare::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -template -void CLConvolutionSquare::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(conv == nullptr); - std::array conv_col{ 0 }; - std::array conv_row{ 0 }; - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - if(_is_separable) - { - std::pair type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffer - _tmp.allocator()->allocate(); - } - else - { - _kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); - } -} - -template -void CLConvolutionSquare::run() -{ - CLScheduler::get().enqueue(*_border_handler); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(*_kernel_hor, false); - CLScheduler::get().enqueue(*_kernel_vert); - } - else - { - CLScheduler::get().enqueue(*_kernel); - } -} - -template class arm_compute::CLConvolutionSquare<5>; -template class arm_compute::CLConvolutionSquare<7>; -template class arm_compute::CLConvolutionSquare<9>; - -void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value); -} - -void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, - BorderMode border_mode, uint8_t constant_border_value) -{ - border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode; - auto k = std::make_unique(); - k->configure(compile_context, input, output, conv, rows, cols, scale, false); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp deleted file mode 100644 index 2e3ecf7700..0000000000 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDerivative.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLDerivativeKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp deleted file mode 100644 index 92c5cc7ab1..0000000000 --- a/src/runtime/CL/functions/CLDilate.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLDilate.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLDilateKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp deleted file mode 100644 index 11607cf71d..0000000000 --- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h" - -#include "arm_compute/core/CL/ICLDistribution1D.h" -#include "arm_compute/core/CL/ICLLut.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLHistogramKernel.h" -#include "src/core/CL/kernels/CLTableLookupKernel.h" - -#include -#include -#include -#include - -using namespace arm_compute; - -namespace -{ -void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut) -{ - dist.map(true); - cum_dist.map(true); - lut.map(true); - - const uint32_t *dist_ptr = dist.buffer(); - uint32_t *cum_dist_ptr = cum_dist.buffer(); - uint8_t *lut_ptr = lut.buffer(); - - ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr); - ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr); - ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr); - - // Calculate cumulative distribution - std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr); - - // Get the number of pixels that have the lowest value in the input image - const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v) - { - return v > 0; - }); - const size_t image_size = cum_dist_ptr[255]; - - if(image_size == num_lowest_pixels) - { - std::iota(lut_ptr, lut_ptr + 256, 0); - } - else - { - const float diff = image_size - num_lowest_pixels; - - for(size_t i = 0; i < 256; ++i) - { - lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f); - } - } - - dist.unmap(); - cum_dist.unmap(); - lut.unmap(); -} -} // namespace - -CLEqualizeHistogram::CLEqualizeHistogram() - : _histogram_kernel(std::make_unique()), - _border_histogram_kernel(std::make_unique()), - _map_histogram_kernel(std::make_unique()), - _hist(nr_bins, 0, max_range), - _cum_dist(nr_bins, 0, max_range), - _cd_lut(nr_bins, DataType::U8) -{ -} - -CLEqualizeHistogram::~CLEqualizeHistogram() = default; - -void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output) -{ - _histogram_kernel->configure(compile_context, input, &_hist); - _border_histogram_kernel->configure(compile_context, input, &_hist); - _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output); -} - -void CLEqualizeHistogram::run() -{ - // Calculate histogram of input. - CLScheduler::get().enqueue(*_histogram_kernel, false); - - // Calculate remaining pixels when image is not multiple of the elements of histogram kernel - CLScheduler::get().enqueue(*_border_histogram_kernel, false); - - // Calculate cumulative distribution of histogram and create LUT. - calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut); - - // Map input to output using created LUT. - CLScheduler::get().enqueue(*_map_histogram_kernel); -} diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp deleted file mode 100644 index 29551fc6bd..0000000000 --- a/src/runtime/CL/functions/CLErode.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLErode.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLErodeKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp deleted file mode 100644 index a3a62d6d5e..0000000000 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLFastCorners.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/CL/kernels/CLFastCornersKernel.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include -#include - -using namespace arm_compute; - -CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(std::make_unique()), - _suppr_func(), - _copy_array_kernel(std::make_unique()), - _output(), - _suppr(), - _win(), - _non_max(false), - _num_corners(nullptr), - _num_buffer(), - _corners(nullptr), - _constant_border_value(0) -{ -} - -CLFastCorners::~CLFastCorners() = default; - -void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, - unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value); -} - -void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, - unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255); - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8); - _output.allocator()->init(tensor_info); - - _non_max = nonmax_suppression; - _num_corners = num_corners; - _corners = corners; - _num_buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); - _constant_border_value = constant_border_value; - - const bool update_number = (nullptr != _num_corners); - - _memory_group.manage(&_output); - _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); - - if(!_non_max) - { - _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer); - } - else - { - _suppr.allocator()->init(tensor_info); - _memory_group.manage(&_suppr); - - _suppr_func.configure(compile_context, &_output, &_suppr, border_mode); - _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); - - _suppr.allocator()->allocate(); - } - - // Allocate intermediate tensors - _output.allocator()->allocate(); -} - -void CLFastCorners::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_non_max) - { - ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function"); - const auto out_buffer = static_cast(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size())); - memset(out_buffer, 0, _output.info()->total_size()); - q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer); - } - - CLScheduler::get().enqueue(*_fast_corners_kernel, false); - - if(_non_max) - { - _suppr_func.run(); - } - - CLScheduler::get().enqueue(*_copy_array_kernel, false); - - unsigned int get_num_corners = 0; - q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners); - - size_t corner_size = std::min(static_cast(get_num_corners), _corners->max_num_values()); - - _corners->resize(corner_size); - - if(_num_corners != nullptr) - { - *_num_corners = get_num_corners; - } - - q.flush(); -} diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp deleted file mode 100644 index 8eeade2f47..0000000000 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" - -#include - -using namespace arm_compute; - -void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp deleted file mode 100644 index ee72fcbe11..0000000000 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" - -#include - -using namespace arm_compute; - -CLGaussian5x5::CLGaussian5x5(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _kernel_hor(std::make_unique()), - _kernel_vert(std::make_unique()), - _border_handler(std::make_unique()), - _tmp() -{ -} - -CLGaussian5x5::~CLGaussian5x5() = default; - -void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Configure kernels - _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _tmp.allocator()->allocate(); -} - -void CLGaussian5x5::run() -{ - CLScheduler::get().enqueue(*_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(*_kernel_hor, false); - CLScheduler::get().enqueue(*_kernel_vert); -} diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp deleted file mode 100644 index 9fe35f6f0e..0000000000 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "src/core/CL/kernels/CLScaleKernel.h" - -#include - -using namespace arm_compute; - -CLGaussianPyramid::CLGaussianPyramid() - : _input(nullptr), _pyramid(nullptr), _tmp() -{ -} - -CLGaussianPyramid::~CLGaussianPyramid() = default; - -CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT - : _horizontal_border_handler(), - _vertical_border_handler(), - _horizontal_reduction(), - _vertical_reduction() -{ -} - -CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default; - -void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(pyramid == nullptr); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); - - // Constant value to use for vertical fill border when the border mode is CONSTANT - const uint16_t pixel_value_u16 = static_cast(constant_border_value) * 2 + static_cast(constant_border_value) * 8 + static_cast(constant_border_value) * 6; - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _horizontal_border_handler.reserve(num_levels - 1); - _vertical_border_handler.reserve(num_levels - 1); - _horizontal_reduction.reserve(num_levels - 1); - _vertical_reduction.reserve(num_levels - 1); - - // Apply half scale to the X dimension of the tensor shape - TensorShape tensor_shape = pyramid->info()->tensor_shape(); - tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16); - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure horizontal kernel */ - _horizontal_reduction.emplace_back(std::make_unique()); - _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); - - /* Configure vertical kernel */ - _vertical_reduction.emplace_back(std::make_unique()); - _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); - - /* Configure border */ - _horizontal_border_handler.emplace_back(std::make_unique()); - _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value)); - - /* Configure border */ - _vertical_border_handler.emplace_back(std::make_unique()); - _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16)); - } - _tmp.allocate(); - } -} - -void CLGaussianPyramidHalf::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - CLScheduler::get().enqueue(*_horizontal_border_handler[i], false); - CLScheduler::get().enqueue(*_horizontal_reduction[i], false); - CLScheduler::get().enqueue(*_vertical_border_handler[i], false); - CLScheduler::get().enqueue(*_vertical_reduction[i], false); - } -} - -CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT - : _gauss5x5(), - _scale_nearest() -{ -} - -void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _gauss5x5.resize(num_levels - 1); - _scale_nearest.reserve(num_levels - 1); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); - - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure gaussian 5x5 */ - _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); - - /* Configure scale image kernel */ - _scale_nearest.emplace_back(std::make_unique()); - _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER }); - } - - _tmp.allocate(); - } -} - -void CLGaussianPyramidOrb::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - _gauss5x5[i].run(); - CLScheduler::get().enqueue(*_scale_nearest[i]); - } -} diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp deleted file mode 100644 index 8d9ea17d66..0000000000 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/HOGInfo.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -using namespace arm_compute; - -CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _gradient(), - _orient_bin(std::make_unique()), - _block_norm(std::make_unique()), - _mag(), - _phase(), - _hog_space() -{ -} - -CLHOGDescriptor::~CLHOGDescriptor() = default; - -void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); -} - -void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == output); - ARM_COMPUTE_ERROR_ON(nullptr == hog); - - const HOGInfo *hog_info = hog->info(); - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const size_t num_bins = hog_info->num_bins(); - - Size2D cell_size = hog_info->cell_size(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell_size.width; - const size_t num_cells_y = height / cell_size.height; - - // TensorShape of the input image - const TensorShape &shape_img = input->info()->tensor_shape(); - - // TensorShape of the hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Intitialize tensors for magnitude, phase and hog space - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space.allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space); - - // Initialise orientation binning kernel - _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); - - // Initialize HOG norm kernel - _block_norm->configure(compile_context, &_hog_space, output, hog->info()); - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - _hog_space.allocator()->allocate(); -} - -void CLHOGDescriptor::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run gradient - _gradient.run(); - - // Run orientation binning - CLScheduler::get().enqueue(*_orient_bin, false); - - // Run block normalization - CLScheduler::get().enqueue(*_block_norm); -} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp deleted file mode 100644 index 365021c723..0000000000 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGDetector.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLHOGDetectorKernel.h" - -#include - -using namespace arm_compute; - -CLHOGDetector::CLHOGDetector() - : _hog_detector_kernel(std::make_unique()), _detection_windows(nullptr), _num_detection_windows() -{ -} - -CLHOGDetector::~CLHOGDetector() = default; - -void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, - float threshold, size_t idx_class) -{ - _detection_windows = detection_windows; - - // Allocate buffer for storing the number of detected objects - _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); - - // Configure HOGDetectorKernel - _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - // Reset number of detections - const unsigned int init_num_detection_windows = _detection_windows->num_values(); - q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); - - // Run CLHOGDetectorKernel - CLScheduler::get().enqueue(*_hog_detector_kernel); - - // Read number of detections - unsigned int num_detection_windows = 0; - q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows); - - // Update the number of values stored in _detection_windows - _detection_windows->resize(static_cast(num_detection_windows)); - - q.flush(); -} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp deleted file mode 100644 index f3aa527417..0000000000 --- a/src/runtime/CL/functions/CLHOGGradient.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -using namespace arm_compute; - -CLHOGGradient::CLHOGGradient(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _derivative(), - _mag_phase(std::make_unique()), - _gx(), - _gy() -{ -} - -void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value); -} - -void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); - - const TensorShape &shape_img = input->info()->tensor_shape(); - - // Allocate image memory - TensorInfo info(shape_img, Format::S16); - _gx.allocator()->init(info); - _gy.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Initialise derivate kernel - _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - - // Initialise magnitude/phase kernel - if(PhaseType::UNSIGNED == phase_type) - { - _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); - } - else - { - _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); - } - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); -} - -void CLHOGGradient::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run derivative - _derivative.run(); - - // Run magnitude/phase kernel - CLScheduler::get().enqueue(*_mag_phase); -} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp deleted file mode 100644 index 2464e6cf9f..0000000000 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/Scheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" -#include "src/core/CL/kernels/CLHOGDetectorKernel.h" -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -using namespace arm_compute; - -CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _gradient_kernel(), - _orient_bin_kernel(), - _block_norm_kernel(), - _hog_detect_kernel(), - _non_maxima_kernel(), - _hog_space(), - _hog_norm_space(), - _detection_windows(), - _mag(), - _phase(), - _non_maxima_suppression(false), - _num_orient_bin_kernel(0), - _num_block_norm_kernel(0), - _num_hog_detect_kernel(0) -{ -} - -CLHOGMultiDetection::~CLHOGMultiDetection() = default; - -void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression, - min_distance); -} - -void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, - ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); - ARM_COMPUTE_ERROR_ON(nullptr == detection_windows); - ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models()); - - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const TensorShape &shape_img = input->info()->tensor_shape(); - const size_t num_models = multi_hog->num_models(); - PhaseType phase_type = multi_hog->model(0)->info()->phase_type(); - - size_t prev_num_bins = multi_hog->model(0)->info()->num_bins(); - Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size(); - Size2D prev_block_size = multi_hog->model(0)->info()->block_size(); - Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride(); - - /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object - * - * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * - * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel - * with "input_orient_bin", "input_hog_detect" and "input_block_norm" - */ - std::vector input_orient_bin; - std::vector input_hog_detect; - std::vector> input_block_norm; - - input_orient_bin.push_back(0); - input_hog_detect.push_back(0); - input_block_norm.emplace_back(0, 0); - - for(size_t i = 1; i < num_models; ++i) - { - size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); - Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); - Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); - Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); - - if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) - { - prev_num_bins = cur_num_bins; - prev_cell_size = cur_cell_size; - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute orientation binning and block normalization kernels. Update input to process - input_orient_bin.push_back(i); - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) - || (cur_block_stride.height != prev_block_stride.height)) - { - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute block normalization kernel. Update input to process - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - - // Update input to process for hog detector kernel - input_hog_detect.push_back(input_block_norm.size() - 1); - } - - _detection_windows = detection_windows; - _non_maxima_suppression = non_maxima_suppression; - _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute - _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute - _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute - - _orient_bin_kernel.reserve(_num_orient_bin_kernel); - _block_norm_kernel.reserve(_num_block_norm_kernel); - _hog_detect_kernel.resize(_num_hog_detect_kernel); - _hog_space.resize(_num_orient_bin_kernel); - _hog_norm_space.resize(_num_block_norm_kernel); - - // Allocate tensors for magnitude and phase - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value); - - // Configure NETensor for the HOG space and orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - const size_t idx_multi_hog = input_orient_bin[i]; - - // Get the corresponding cell size and number of bins - const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size(); - const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell.width; - const size_t num_cells_y = height / cell.height; - - // TensorShape of hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate HOG space - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space[i].allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space[i]); - - // Initialise orientation binning kernel - _orient_bin_kernel.emplace_back(std::make_unique()); - _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - // Configure CLTensor for the normalized HOG space and block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - const size_t idx_multi_hog = input_block_norm[i].first; - const size_t idx_orient_bin = input_block_norm[i].second; - - // Allocate normalized HOG space - TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); - _hog_norm_space[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_hog_norm_space[i]); - - // Initialize block normalization kernel - _block_norm_kernel.emplace_back(std::make_unique()); - _block_norm_kernel.back()->configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - - detection_window_strides->map(CLScheduler::get().queue(), true); - - // Configure HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - const size_t idx_block_norm = input_hog_detect[i]; - - _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); - } - - detection_window_strides->unmap(CLScheduler::get().queue()); - - // Configure non maxima suppression kernel - _non_maxima_kernel.configure(_detection_windows, min_distance); - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - _hog_norm_space[i].allocator()->allocate(); - } -} - -void CLHOGMultiDetection::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reset detection window - _detection_windows->clear(); - - // Run gradient - _gradient_kernel.run(); - - // Run orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - CLScheduler::get().enqueue(*_orient_bin_kernel[i], false); - } - - // Run block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - CLScheduler::get().enqueue(*_block_norm_kernel[i], false); - } - - // Run HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - _hog_detect_kernel[i].run(); - } - - // Run non-maxima suppression kernel if enabled - if(_non_maxima_suppression) - { - // Map detection windows array before computing non maxima suppression - _detection_windows->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY); - _detection_windows->unmap(CLScheduler::get().queue()); - } -} diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp deleted file mode 100644 index 37f428c677..0000000000 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "arm_compute/runtime/Scheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLHarrisCornersKernel.h" -#include "src/core/CL/kernels/CLSobel5x5Kernel.h" -#include "src/core/CL/kernels/CLSobel7x7Kernel.h" - -#include -#include - -using namespace arm_compute; - -CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(nullptr), - _harris_score(std::make_unique()), - _non_max_suppr(), - _candidates(), - _sort_euclidean(), - _border_gx(std::make_unique()), - _border_gy(std::make_unique()), - _gx(), - _gy(), - _score(), - _nonmax(), - _corners_list(), - _num_corner_candidates(0), - _corners(nullptr) -{ -} - -CLHarrisCorners::~CLHarrisCorners() = default; - -void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16); -} - -void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - - _corners = corners; - - const TensorShape shape = input->info()->tensor_shape(); - const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32; - TensorInfo tensor_info(shape, 1, dt); - - _gx.allocator()->init(tensor_info); - _gy.allocator()->init(tensor_info); - - TensorInfo info_f32(shape, 1, DataType::F32); - _score.allocator()->init(info_f32); - _nonmax.allocator()->init(info_f32); - - _corners_list.resize(shape.x() * shape.y()); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - /* Set/init Sobel kernel accordingly with gradient_size */ - switch(gradient_size) - { - case 3: - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 5: - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 7: - { - auto k = std::make_unique(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Gradient size not implemented"); - } - - // Normalization factor - const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size); - const float pow4_normalization_factor = pow(norm_factor, 4); - - // Manage intermediate buffers - _memory_group.manage(&_score); - - // Set/init Harris Score kernel accordingly with block_size - _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); - - // Configure border filling using harris score kernel's block size - _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value)); - _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Init non-maxima suppression function - _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode); - - // Allocate intermediate buffers - _score.allocator()->allocate(); - - // Init corner candidates kernel - _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates); - - // Allocate intermediate buffers - _nonmax.allocator()->allocate(); - - // Init euclidean distance - _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist); -} - -void CLHarrisCorners::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Init to 0 number of corner candidates - _num_corner_candidates = 0; - - // Run Sobel kernel - _sobel->run(); - - // Fill border before harris score kernel - CLScheduler::get().enqueue(*_border_gx, false); - CLScheduler::get().enqueue(*_border_gy, false); - - // Run harris score kernel - CLScheduler::get().enqueue(*_harris_score, false); - - // Run non-maxima suppression - _non_max_suppr.run(); - - // Run corner candidate kernel - _nonmax.map(true); - Scheduler::get().schedule(&_candidates, Window::DimY); - _nonmax.unmap(); - - _corners->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_sort_euclidean, Window::DimY); - _corners->unmap(CLScheduler::get().queue()); -} diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp deleted file mode 100644 index f278cf0dc2..0000000000 --- a/src/runtime/CL/functions/CLHistogram.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHistogram.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHistogram::CLHistogram() - : _kernel(), _kernel_border() -{ -} - -void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) -{ - _kernel.configure(compile_context, input, output); - _kernel_border.configure(compile_context, input, output); -} - -void CLHistogram::run() -{ - CLScheduler::get().enqueue(_kernel, false); - CLScheduler::get().enqueue(_kernel_border); -} diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp deleted file mode 100644 index 56a151a085..0000000000 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLIntegralImage.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLIntegralImageKernel.h" - -using namespace arm_compute; - -CLIntegralImage::CLIntegralImage() - : _integral_hor(std::make_unique()), - _integral_vert(std::make_unique()) -{ -} - -CLIntegralImage::~CLIntegralImage() = default; - -void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - _integral_hor->configure(compile_context, input, output); - _integral_vert->configure(compile_context, output); -} - -void CLIntegralImage::run() -{ - CLScheduler::get().enqueue(*_integral_hor, false); - CLScheduler::get().enqueue(*_integral_vert); -} diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp deleted file mode 100644 index 1ad19e56ea..0000000000 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" - -using namespace arm_compute; - -CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT - : _num_levels(0), - _gaussian_pyr_function(), - _convf(), - _subf(), - _depth_function(), - _gauss_pyr(), - _conv_pyr() -{ -} - -void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value); -} - -void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - _num_levels = pyramid->info()->num_levels(); - - // Create and initialize the gaussian pyramid and the convoluted pyramid - PyramidInfo pyramid_info; - pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); - - _gauss_pyr.init(pyramid_info); - _conv_pyr.init(pyramid_info); - - // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value); - - _convf.resize(_num_levels); - _subf.resize(_num_levels); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); - } - - _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); - - _gauss_pyr.allocate(); - _conv_pyr.allocate(); -} - -void CLLaplacianPyramid::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); - - _gaussian_pyr_function.run(); // compute gaussian pyramid - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].run(); // convolute gaussian pyramid - } - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _subf[i].run(); // compute laplacian image - } - - _depth_function.run(); -} diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp deleted file mode 100644 index d7fd81754b..0000000000 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include - -using namespace arm_compute; - -CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT - : _tmp_pyr(), - _addf(), - _scalef(), - _depthf() -{ -} - -void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value); -} - -void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - const size_t num_levels = pyramid->info()->num_levels(); - - // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) - PyramidInfo pyramid_info; - pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); - _tmp_pyr.init(pyramid_info); - - // Allocate add and scale functions. Level 0 does not need to be scaled. - _addf.resize(num_levels); - _scalef.resize(num_levels - 1); - - const size_t last_level = num_levels - 1; - - _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); - - // Scale levels n-1 to 1, and add levels n-2 to 0 - for(size_t l = 0; l < last_level; ++l) - { - _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value }); - _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); - } - - // Convert level 0 from S16 to U8 - _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); - - _tmp_pyr.allocate(); -} - -void CLLaplacianReconstruct::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function"); - - const size_t last_level = _tmp_pyr.info()->num_levels() - 1; - - _addf[last_level].run(); - - // Run l = [last_level - 1, 0] - for(size_t l = last_level; l-- > 0;) - { - _scalef[l].run(); - _addf[l].run(); - } - - _depthf.run(); -} diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp deleted file mode 100644 index 0599a11fa1..0000000000 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMagnitude.h" - -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -#include - -using namespace arm_compute; - -void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type); -} - -void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - auto k = std::make_unique(); - k->configure(compile_context, input1, input2, output, nullptr, mag_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp deleted file mode 100644 index d8cd41d45f..0000000000 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/TensorInfo.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLMeanStdDevKernel.h" -#include "src/core/CL/kernels/CLReductionOperationKernel.h" - -using namespace arm_compute; - -CLMeanStdDev::CLMeanStdDev(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _data_type(), - _num_pixels(), - _run_stddev(), - _reduction_operation_mean(), - _reduction_operation_stddev(), - _reduction_output_mean(), - _reduction_output_stddev(), - _mean(nullptr), - _stddev(nullptr), - _mean_stddev_kernel(std::make_unique()), - _fill_border_kernel(std::make_unique()), - _global_sum(), - _global_sum_squared() -{ -} - -CLMeanStdDev::~CLMeanStdDev() = default; - -Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) -{ - ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); - if(is_data_type_float(input->data_type())) - { - ARM_COMPUTE_UNUSED(mean); - ARM_COMPUTE_UNUSED(stddev); - - TensorShape output_shape = TensorShape{ 1, input->dimension(1) }; - TensorInfo output_shape_info = TensorInfo(output_shape, 1, DataType::U8); - return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM); - } - else - { - return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr); - } -} - -void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev); -} - -void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev) -{ - // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev - _data_type = input->info()->data_type(); - - if(is_data_type_float(_data_type)) - { - _num_pixels = input->info()->dimension(0) * input->info()->dimension(1); - - _memory_group.manage(&_reduction_output_mean); - _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM); - _reduction_output_mean.allocator()->allocate(); - _mean = mean; - - if(stddev != nullptr) - { - _memory_group.manage(&_reduction_output_stddev); - _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE); - _reduction_output_stddev.allocator()->allocate(); - _stddev = stddev; - _run_stddev = true; - } - } - else - { - _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - - if(stddev != nullptr) - { - _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - } - - _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); - } -} - -template -void CLMeanStdDev::run_float() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Perform reduction on x-axis - _reduction_operation_mean.run(); - if(_run_stddev) - { - _reduction_operation_stddev.run(); - _reduction_output_stddev.map(true); - } - - _reduction_output_mean.map(true); - - auto mean = static_cast(0); - - // Calculate final result for mean - for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i) - { - mean += *reinterpret_cast(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i))); - } - - mean /= _num_pixels; - *_mean = mean; - - if(_run_stddev) - { - auto stddev = static_cast(0); - // Calculate final result for stddev - for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i) - { - stddev += *reinterpret_cast(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i))); - } - *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean)); - - _reduction_output_stddev.unmap(); - } - _reduction_output_mean.unmap(); -} - -void CLMeanStdDev::run_int() -{ - CLScheduler::get().enqueue(*_fill_border_kernel); - CLScheduler::get().enqueue(*_mean_stddev_kernel); -} - -void CLMeanStdDev::run() -{ - switch(_data_type) - { - case DataType::F16: - run_float(); - break; - case DataType::F32: - run_float(); - break; - case DataType::U8: - run_int(); - break; - default: - ARM_COMPUTE_ERROR_ON("Not supported"); - } -} diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp deleted file mode 100644 index b32063a8fe..0000000000 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLMedian3x3Kernel.h" - -#include - -using namespace arm_compute; - -void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp deleted file mode 100644 index ace6a1cb21..0000000000 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" - -namespace arm_compute -{ -CLMinMaxLocation::CLMinMaxLocation() - : _min_max_kernel(std::make_unique()), - _min_max_loc_kernel(std::make_unique()), - _min_max_vals(), - _min_max_count_vals(), - _min(nullptr), - _max(nullptr), - _min_count(nullptr), - _max_count(nullptr), - _min_loc(nullptr), - _max_loc(nullptr) -{ -} - -CLMinMaxLocation::~CLMinMaxLocation() = default; - -void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); -} - -void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, - uint32_t *min_count, - uint32_t *max_count) -{ - ARM_COMPUTE_ERROR_ON(nullptr == min); - ARM_COMPUTE_ERROR_ON(nullptr == max); - - _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t)); - _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t)); - _min = min; - _max = max; - _min_count = min_count; - _max_count = max_count; - _min_loc = min_loc; - _max_loc = max_loc; - - _min_max_kernel->configure(compile_context, input, &_min_max_vals); - _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); -} - -void CLMinMaxLocation::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - CLScheduler::get().enqueue(*_min_max_kernel, false); - CLScheduler::get().enqueue(*_min_max_loc_kernel, false); - - // Update min and max - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast(_min)); - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast(_max)); - - // Update min and max count - if(_min_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count); - } - if(_max_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count); - } - - // Update min/max point arrays (Makes the kernel blocking) - if(_min_loc != nullptr) - { - unsigned int min_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count); - size_t min_corner_size = std::min(static_cast(min_count), _min_loc->max_num_values()); - _min_loc->resize(min_corner_size); - } - if(_max_loc != nullptr) - { - unsigned int max_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count); - size_t max_corner_size = std::min(static_cast(max_count), _max_loc->max_num_values()); - _max_loc->resize(max_corner_size); - } -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp deleted file mode 100644 index ec88f879b7..0000000000 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" - -#include - -using namespace arm_compute; - -void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value); -} - -void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, - const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp deleted file mode 100644 index 5906ea5a4b..0000000000 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" - -#include - -using namespace arm_compute; - -void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode); -} - -void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - if(border_mode != BorderMode::UNDEFINED) - { - _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); - } - else - { - _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); - } -} diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp deleted file mode 100644 index 76e0ac5f0b..0000000000 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLLKTrackerKernel.h" - -using namespace arm_compute; - -CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _tracker_init_kernel(), - _tracker_stage0_kernel(), - _tracker_stage1_kernel(), - _tracker_finalize_kernel(std::make_unique()), - _func_scharr(), - _scharr_gx(), - _scharr_gy(), - _old_points(nullptr), - _new_points_estimates(nullptr), - _new_points(nullptr), - _old_points_internal(), - _new_points_internal(), - _coefficient_table(), - _old_values(), - _num_levels(0) -{ -} - -CLOpticalFlow::~CLOpticalFlow() = default; - -void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension, - use_initial_estimate, border_mode, constant_border_value); -} - -void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == old_points); - ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); - ARM_COMPUTE_ERROR_ON(nullptr == new_points); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); - - // Set member variables - _old_points = old_points; - _new_points_estimates = new_points_estimates; - _new_points = new_points; - _num_levels = old_pyramid->info()->num_levels(); - - const float pyr_scale = old_pyramid->info()->scale(); - const int list_length = old_points->num_values(); - const int old_values_list_length = list_length * window_dimension * window_dimension; - - // Create kernels and tensors - _tracker_init_kernel.reserve(_num_levels); - _tracker_stage0_kernel.reserve(_num_levels); - _tracker_stage1_kernel.reserve(_num_levels); - _func_scharr.resize(_num_levels); - _scharr_gx.resize(_num_levels); - _scharr_gy.resize(_num_levels); - - // Create internal keypoint arrays - _old_points_internal = std::make_unique(list_length); - _old_points_internal->resize(list_length); - _new_points_internal = std::make_unique(list_length); - _new_points_internal->resize(list_length); - _coefficient_table = std::make_unique(list_length); - _coefficient_table->resize(list_length); - _old_values = std::make_unique(old_values_list_length); - _old_values->resize(old_values_list_length); - _new_points->resize(list_length); - - for(size_t i = 0; i < _num_levels; ++i) - { - // Get images from the ith level of old and right pyramid - ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i); - ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i); - - // Get width and height of images - const unsigned int width_ith = old_ith_input->info()->dimension(0); - const unsigned int height_ith = new_ith_input->info()->dimension(1); - - // Initialize Scharr tensors - TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16); - _scharr_gx[i].allocator()->init(tensor_info); - _scharr_gy[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_scharr_gx[i]); - _memory_group.manage(&_scharr_gy[i]); - - // Init Scharr kernel - _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); - - // Init Lucas-Kanade init kernel - _tracker_init_kernel.emplace_back(std::make_unique()); - _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); - - // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel.emplace_back(std::make_unique()); - _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); - - // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel.emplace_back(std::make_unique()); - _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); - - // Allocate intermediate buffers - _scharr_gx[i].allocator()->allocate(); - _scharr_gy[i].allocator()->allocate(); - } - - // Finalize Lucas-Kanade - _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points); -} - -void CLOpticalFlow::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int level = _num_levels; level > 0; --level) - { - // Run Scharr kernel - _func_scharr[level - 1].run(); - - // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]); - - // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]); - - // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]); - } - - CLScheduler::get().enqueue(*_tracker_finalize_kernel, true); -} diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp deleted file mode 100644 index b2ff5d05ca..0000000000 --- a/src/runtime/CL/functions/CLPhase.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLPhase.h" - -#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" - -#include - -using namespace arm_compute; - -void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type); -} - -void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - auto k = std::make_unique(); - k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp deleted file mode 100644 index 563ec19266..0000000000 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLScharr3x3Kernel.h" - -#include - -using namespace arm_compute; - -void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp deleted file mode 100644 index 6724c12a72..0000000000 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLSobel3x3Kernel.h" - -#include - -using namespace arm_compute; - -CLSobel3x3::~CLSobel3x3() = default; - -void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp deleted file mode 100644 index 98f215794c..0000000000 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLSobel5x5Kernel.h" - -using namespace arm_compute; - -CLSobel5x5::CLSobel5x5(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _sobel_hor(std::make_unique()), - _sobel_vert(std::make_unique()), - _border_handler(std::make_unique()), - _tmp_x(), - _tmp_y() -{ -} - -CLSobel5x5::~CLSobel5x5() = default; - -void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel5x5::run() -{ - CLScheduler::get().enqueue(*_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(*_sobel_hor, false); - CLScheduler::get().enqueue(*_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp deleted file mode 100644 index a3d63f98dd..0000000000 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLSobel7x7Kernel.h" - -using namespace arm_compute; - -CLSobel7x7::CLSobel7x7(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), - _sobel_hor(std::make_unique()), - _sobel_vert(std::make_unique()), - _border_handler(std::make_unique()), - _tmp_x(), - _tmp_y() -{ -} - -CLSobel7x7::~CLSobel7x7() = default; - -void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel7x7::run() -{ - CLScheduler::get().enqueue(*_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(*_sobel_hor, false); - CLScheduler::get().enqueue(*_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp deleted file mode 100644 index a4671f51bd..0000000000 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLTableLookup.h" - -#include "src/core/CL/kernels/CLTableLookupKernel.h" - -#include - -using namespace arm_compute; - -void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); -} - -void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, lut, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp deleted file mode 100644 index 70bc3b9365..0000000000 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLThreshold.h" - -#include "src/core/CL/kernels/CLThresholdKernel.h" - -#include - -namespace arm_compute -{ -void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, info); -} - -void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, info); - _kernel = std::move(k); -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp deleted file mode 100644 index 9a22446cf6..0000000000 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLWarpAffine.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLWarpAffineKernel.h" - -#include - -using namespace arm_compute; - -void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp deleted file mode 100644 index 0ec6b42e75..0000000000 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" - -#include "arm_compute/core/PixelValue.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" - -#include - -using namespace arm_compute; - -void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp deleted file mode 100644 index ad62a2254a..0000000000 --- a/src/runtime/NEON/functions/NEConvolution.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEConvolution.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/NEON/kernels/NEConvolutionKernel.h" -#include "src/core/NEON/kernels/NEConvolutionKernel.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" - -#include -#include - -namespace arm_compute -{ -NEConvolution3x3::~NEConvolution3x3() = default; - -void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = std::make_unique(); - k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - auto b = std::make_unique(); - b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); - _border_handler = std::move(b); -} - -template -NEConvolutionSquare::~NEConvolutionSquare() = default; - -template -NEConvolutionSquare::NEConvolutionSquare(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() -{ -} - -template -void NEConvolutionSquare::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(conv == nullptr); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - - std::array conv_col{ { 0 } }; - std::array conv_row{ { 0 } }; - - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - auto b = std::make_unique(); - if(_is_separable) - { - DataType intermediate_type = DataType::UNKNOWN; - std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Calculate scale - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor = std::make_unique>(); - _kernel_vert = std::make_unique>(); - - _kernel_hor->configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert->configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); - - _tmp.allocator()->allocate(); - - b->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); - } - else - { - _kernel = std::make_unique>(); - _kernel->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); - } - _border_handler = std::move(b); -} - -template -void NEConvolutionSquare::run() -{ - NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY); - NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY); - } - else - { - NEScheduler::get().schedule(_kernel.get(), Window::DimY); - } -} - -template class arm_compute::NEConvolutionSquare<5>; -template class arm_compute::NEConvolutionSquare<7>; -template class arm_compute::NEConvolutionSquare<9>; - -NEConvolutionRectangle::~NEConvolutionRectangle() = default; - -void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - border_mode = (border_mode == BorderMode::UNDEFINED) ? BorderMode::CONSTANT : border_mode; - auto k = std::make_unique(); - k->configure(input, output, conv, rows, cols, scale, false); - _kernel = std::move(k); - - auto b = std::make_unique(); - b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); - _border_handler = std::move(b); -} -} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp deleted file mode 100644 index a34be71ea0..0000000000 --- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h" - -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" - -#include - -namespace arm_compute -{ -void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode) -{ - auto k = std::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - auto b = std::make_unique(); - if(border_mode != BorderMode::UNDEFINED) - { - b->configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast(0.f)); - } - else - { - b->configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast(0.f)); - } - _border_handler = std::move(b); -} -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp new file mode 100644 index 0000000000..a55f7bc218 --- /dev/null +++ b/src/runtime/NEON/functions/NERemap.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NERemap.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NERemapKernel.h" + +#include + +namespace arm_compute +{ +void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); + + auto k = std::make_unique(); + k->configure(input, map_x, map_y, output, policy); + _kernel = std::move(k); + + auto b = std::make_unique(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); +} +} // namespace arm_compute -- cgit v1.2.1