diff options
-rw-r--r-- | arm_compute/core/CL/CLKernels.h | 1 | ||||
-rw-r--r-- | arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h | 86 | ||||
-rw-r--r-- | arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h | 21 | ||||
-rw-r--r-- | arm_compute/runtime/CL/CLFunctions.h | 1 | ||||
-rw-r--r-- | arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h | 65 | ||||
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 5 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/direct_convolution.cl | 227 | ||||
-rw-r--r-- | src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 171 | ||||
-rw-r--r-- | src/runtime/CL/functions/CLDirectConvolutionLayer.cpp | 50 | ||||
-rw-r--r-- | tests/dataset/ConvolutionLayerDataset.h | 3 | ||||
-rw-r--r-- | tests/validation/CL/CMakeLists.txt | 1 | ||||
-rw-r--r-- | tests/validation/CL/DirectConvolutionLayer.cpp | 158 | ||||
-rw-r--r-- | tests/validation/NEON/CMakeLists.txt | 2 | ||||
-rw-r--r-- | tests/validation/NEON/DirectConvolutionLayer.cpp (renamed from tests/validation/NEON/ConvolutionLayerDirect.cpp) | 4 |
14 files changed, 781 insertions, 14 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 0e9f356e52..cf36beffa3 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -46,6 +46,7 @@ #include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/CL/kernels/CLDilateKernel.h" +#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" #include "arm_compute/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h new file mode 100644 index 0000000000..28eecf029a --- /dev/null +++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the direct convolution kernel. + */ +template <unsigned int kernel_size> +class CLDirectConvolutionLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDirectConvolutionLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default; + /** Default destructor */ + ~CLDirectConvolutionLayerKernel() = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. 
+ * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + BorderSize border_size() const override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_biases; + const ICLTensor *_weights; + ICLTensor *_output; + BorderSize _border_size; + int _conv_pad_x; + int _conv_pad_y; + int _conv_stride_x; + int _conv_stride_y; +}; + +using CLDirectConvolutionLayer3x3Kernel = CLDirectConvolutionLayerKernel<3>; +} +#endif /*__ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index d726071606..5612e1ae62 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -46,16 +46,17 @@ public: NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default; /** Default destructor */ ~NEDirectConvolutionLayerKernel() = default; - /** Set the input, weights and output tensors. - * - * @param[in] input Input tensor. Data types supported: QS8/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p input. - * @param[out] output Output tensor. 
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ + /** Set the input, weights, and output tensors. + * + * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[out] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); // Inherited methods overridden: diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 862b2c1c40..82abbe0aac 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -46,6 +46,7 @@ #include "arm_compute/runtime/CL/functions/CLDepthConvert.h" #include "arm_compute/runtime/CL/functions/CLDerivative.h" #include "arm_compute/runtime/CL/functions/CLDilate.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h" #include "arm_compute/runtime/CL/functions/CLErode.h" #include "arm_compute/runtime/CL/functions/CLFastCorners.h" diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h new file mode 100644 index 0000000000..8b43e18167 --- 
/dev/null +++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H__ + +#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute direct convolution function: + */ +class CLDirectConvolutionLayer : public IFunction +{ +public: + CLDirectConvolutionLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + CLDirectConvolutionLayer3x3Kernel _direct_conv_kernel; + CLFillBorderKernel _input_border_handler; +}; +} +#endif /* __ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H__ */ diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 769d50992d..8f6ec20fc3 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -145,6 +145,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "copy_to_keypoint", "fast_corners.cl" }, { "derivative", "derivative.cl" }, { "dilate", "dilate.cl" }, + { "direct_convolution3x3", "direct_convolution.cl" }, { "erode", "erode.cl" }, { "fast_corners", "fast_corners.cl" }, { "fill_image_borders_constant", "fill_border.cl" }, @@ -348,6 +349,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = #include "./cl_kernels/dilate.clembed" }, { + "direct_convolution.cl", +#include "./cl_kernels/direct_convolution.clembed" + }, + { "erode.cl", #include "./cl_kernels/erode.clembed" }, 
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl new file mode 100644 index 0000000000..b5524e1d4b --- /dev/null +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if STRIDE_X == 2 +#define CONVOLVE1x3(left_pixel_position, left_coeff, middle_coeff, right_coeff) convolution1x3_stride2(left_pixel_position, left_coeff, middle_coeff, right_coeff) +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ +#define CONVOLVE1x3(left_pixel_position, left_coeff, middle_coeff, right_coeff) convolution1x3_stride1(left_pixel_position, left_coeff, middle_coeff, right_coeff) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X == 2 */ + +/** Compute a 1D horizontal convolution of size 3 with stride as 1. + * + * @param[in] left_pixel Pointer to the left pixel. + * @param[in] left_coeff Weight of the left pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] right_coeff Weight of the right pixel + * + * @return a convoluted values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3_stride1(__global const DATA_TYPE *left_pixel, + const DATA_TYPE left_coeff, + const DATA_TYPE middle_coeff, + const DATA_TYPE right_coeff) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp = vload16(0, left_pixel); + + VEC_DATA_TYPE(DATA_TYPE, 8) + left = temp.s01234567; + VEC_DATA_TYPE(DATA_TYPE, 8) + middle = temp.s12345678; + VEC_DATA_TYPE(DATA_TYPE, 8) + right = temp.s23456789; + + return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff; +} + +/** Compute a 1D horizontal convolution of size 3 with stride as 2. + * + * @param[in] left_pixel Pointer to the left pixel. + * @param[in] left_coeff Weight of the left pixel + * @param[in] middle_coeff Weight of the middle pixel + * @param[in] right_coeff Weight of the right pixel + * + * @return a convoluted values. 
+ */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3_stride2(__global const DATA_TYPE *left_pixel, + const DATA_TYPE left_coeff, + const DATA_TYPE middle_coeff, + const DATA_TYPE right_coeff) +{ + const int stride_size = 2; + + VEC_DATA_TYPE(DATA_TYPE, 16) + temp1 = vload16(0, left_pixel); + + VEC_DATA_TYPE(DATA_TYPE, 16) + temp2 = vload16(0, left_pixel + 8); + + VEC_DATA_TYPE(DATA_TYPE, 8) + left = (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0246, temp2.s0246); + + VEC_DATA_TYPE(DATA_TYPE, 8) + middle = (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s1357, temp2.s1357); + + VEC_DATA_TYPE(DATA_TYPE, 8) + right = (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s2468, temp2.s2468); + + return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff; +} + +/** Apply a 3x3 2D convolution matrix on the input and return the result. + * + * Convolution matrix layout: + * + * [ mat0, mat1, mat2 ]\n + * [ mat3, mat4, mat5 ]\n + * [ mat6, mat7, mat8 ]\n + * + * @param[in] src A pointer to source Image structure + * @param[in] mat0 Coefficient from the convolution matrix + * @param[in] mat1 Coefficient from the convolution matrix + * @param[in] mat2 Coefficient from the convolution matrix + * @param[in] mat3 Coefficient from the convolution matrix + * @param[in] mat4 Coefficient from the convolution matrix + * @param[in] mat5 Coefficient from the convolution matrix + * @param[in] mat6 Coefficient from the convolution matrix + * @param[in] mat7 Coefficient from the convolution matrix + * @param[in] mat8 Coefficient from the convolution matrix + * + * @return convoluted values. 
+ */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3( + Image *src, + const DATA_TYPE mat0, const DATA_TYPE mat1, const DATA_TYPE mat2, + const DATA_TYPE mat3, const DATA_TYPE mat4, const DATA_TYPE mat5, + const DATA_TYPE mat6, const DATA_TYPE mat7, const DATA_TYPE mat8) +{ + // Output pixels + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels; + + // Row 0 + pixels = CONVOLVE1x3((__global DATA_TYPE *)offset(src, 0, 0), mat0, mat1, mat2); + // Row 1 + pixels += CONVOLVE1x3((__global DATA_TYPE *)offset(src, 0, 1), mat3, mat4, mat5); + // Row 2 + pixels += CONVOLVE1x3((__global DATA_TYPE *)offset(src, 0, 2), mat6, mat7, mat8); + + return pixels; +} + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The convolution stride x and stride y must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1, -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in W dimension + * @param[in] filter_depth The depth size of the filter + */ +__kernel void direct_convolution3x3( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w, + unsigned int filter_depth) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* defined(HAS_BIAS) */ + + VEC_DATA_TYPE(DATA_TYPE, 8) + pixels = 0; + + const uint z_index = get_global_id(2); + + weights.ptr += z_index * weights_stride_w; + + for(int d = 0; d < filter_depth; ++d) + { + VEC_DATA_TYPE(DATA_TYPE, 4) + weights_row1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0)); + VEC_DATA_TYPE(DATA_TYPE, 4) + weights_row2 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 4) + weights_row3 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 2, 0)); + + pixels += convolution3x3(&src, weights_row1.s0, + weights_row1.s1, + weights_row1.s2, + weights_row2.s0, + weights_row2.s1, + weights_row2.s2, + weights_row3.s0, + weights_row3.s1, + weights_row3.s2); + + src.ptr += src_stride_z; + weights.ptr += weights_stride_z; + } + +#ifdef HAS_BIAS + pixels += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))); +#endif /* defined(HAS_BIAS) */ + + vstore8(pixels, 0, (__global DATA_TYPE 
*)dst.ptr); +} diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp new file mode 100644 index 0000000000..7f9e9d20e1 --- /dev/null +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +template <unsigned int kernel_size> +CLDirectConvolutionLayerKernel<kernel_size>::CLDirectConvolutionLayerKernel() + : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_pad_x(0), _conv_pad_y(0), _conv_stride_x(0), _conv_stride_y(0) +{ +} + +template <unsigned int kernel_size> +BorderSize CLDirectConvolutionLayerKernel<kernel_size>::border_size() const +{ + return _border_size; +} + +template <unsigned int kernel_size> +void CLDirectConvolutionLayerKernel<kernel_size>::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + static_assert(kernel_size == 3, "Currently only 3x3 direct convolution is supported!"); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!"); + + ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0)); + + if(biases != nullptr) + { + 
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + _conv_stride_x = std::get<0>(conv_info.stride()); + _conv_stride_y = std::get<1>(conv_info.stride()); + _conv_pad_x = std::get<0>(conv_info.pad()); + _conv_pad_y = std::get<1>(conv_info.pad()); + + _input = input; + _weights = weights; + _output = output; + _biases = biases; + _border_size = BorderSize(_conv_pad_y, _conv_pad_x); + + std::stringstream kernel_name; + std::set<std::string> options; + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + + options.emplace("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)); + + if(_biases != nullptr) + { + options.emplace("-DHAS_BIAS"); + } + + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options)); + + unsigned int idx = (_biases == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor()); + _kernel.setArg<cl_uint>(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w + _kernel.setArg<cl_uint>(idx++, _weights->info()->dimension(2)); // filter depth + + // Using this local workgroup size gives better performance over others that have been tried. 
+ _lws_hint = cl::NDRange(4, 1, 8); + + // Configure kernel window + Window win = calculate_max_window(*output->info()); + + unsigned int num_elems_read_per_iteration = 16 * _conv_stride_x; + unsigned int num_elems_written_per_iteration = 8; + + // Calculate right and bottom border + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * _conv_stride_x + kernel_size), num_elems_read_per_iteration) - _conv_pad_x - input_width; + const int upper_bound_h = ((output->info()->dimension(1) - 1) * _conv_stride_y - _conv_pad_y + kernel_size) - input_height; + const int padding_right = std::max(upper_bound_w, static_cast<int>(kernel_size)); + const int padding_bottom = std::max(upper_bound_h, static_cast<int>(kernel_size)); + + // Create window and update padding + win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration)); + AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom); + + AccessWindowStatic weights_access(weights->info(), 0, 0, kernel_size, kernel_size); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + update_window_and_padding(win, input_access, weights_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +template <unsigned int kernel_size> +void CLDirectConvolutionLayerKernel<kernel_size>::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + Window win_in = window; + + win_in.adjust(Window::DimX, -_conv_pad_x, true); + win_in.adjust(Window::DimY, -_conv_pad_y, true); + 
win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x); + win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y); + + Window slice_in = win_in.first_slice_window_3D(); + + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + add_3D_tensor_argument(idx1, _weights, slice); + + if(_biases != nullptr) + { + Window slice_biases; + slice_biases.use_tensor_dimensions(_biases->info()); + add_1D_tensor_argument(idx1, _biases, slice_biases); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice, _lws_hint); + } + while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); +} + +template class arm_compute::CLDirectConvolutionLayerKernel<3>; diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp new file mode 100644 index 0000000000..65be417afb --- /dev/null +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" + +#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLDirectConvolutionLayer::CLDirectConvolutionLayer() + : _direct_conv_kernel(), _input_border_handler() +{ +} + +void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + _direct_conv_kernel.configure(input, weights, biases, output, conv_info); + + _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); +} + +void CLDirectConvolutionLayer::run() +{ + CLScheduler::get().enqueue(_input_border_handler, false); + CLScheduler::get().enqueue(_direct_conv_kernel); +} diff --git a/tests/dataset/ConvolutionLayerDataset.h b/tests/dataset/ConvolutionLayerDataset.h index af04fa98ea..3d8ec4a2d9 100644 --- a/tests/dataset/ConvolutionLayerDataset.h +++ b/tests/dataset/ConvolutionLayerDataset.h @@ -93,12 +93,13 @@ public: }; /** Data set containing direct convolution tensor shapes. 
*/ -class DirectConvolutionShapes final : public ShapeDataset<3> +class DirectConvolutionShapes final : public ShapeDataset<4> { public: DirectConvolutionShapes() : ShapeDataset(TensorShape(3U, 3U, 3U, 2U, 4U, 5U), TensorShape(32U, 37U, 3U), + TensorShape(64U, 32U, 4U, 2U), TensorShape(13U, 15U, 8U, 3U)) { } diff --git a/tests/validation/CL/CMakeLists.txt b/tests/validation/CL/CMakeLists.txt index 93547b1879..f4e99cfe01 100644 --- a/tests/validation/CL/CMakeLists.txt +++ b/tests/validation/CL/CMakeLists.txt @@ -37,6 +37,7 @@ set(arm_compute_test_validation_OPENCL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Sobel3x3.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Sobel5x5.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Threshold.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/DirectConvolutionLayer.cpp ) add_library(arm_compute_test_validation_OPENCL OBJECT diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp new file mode 100644 index 0000000000..5b00a019ba --- /dev/null +++ b/tests/validation/CL/DirectConvolutionLayer.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "AssetsLibrary.h" +#include "CL/CLAccessor.h" +#include "Globals.h" +#include "TypePrinter.h" +#include "Utils.h" +#include "validation/Datasets.h" +#include "validation/Reference.h" +#include "validation/Validation.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "boost_wrapper.h" + +#include <random> +#include <string> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::test; +using namespace arm_compute::test::validation; + +namespace +{ +const float tolerance_fp = 1e-3f; /**< Tolerance for floating point tests */ + +/** Compute CL direct convolution layer function. + * + * @param[in] src_shape Shape of the input tensor. + * @param[in] weights_shape Shape of the weights. + * @param[in] bias_shape Shape of the bias tensor. + * @param[in] dst_shape Shape of the output tensor. + * @param[in] dt Data type of input, convolution matrix and output tensors. + * @param[in] conv_info Padding and stride information. + * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers + * + * @return Computed output tensor. 
+*/ +CLTensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &dst_shape, + DataType dt, PadStrideInfo conv_info, int fixed_point_position = 0) +{ + // Create tensors + CLTensor src = create_tensor<CLTensor>(src_shape, dt, 1, fixed_point_position); + CLTensor weights = create_tensor<CLTensor>(weights_shape, dt, 1, fixed_point_position); + + CLTensor bias = create_tensor<CLTensor>(bias_shape, dt, 1, fixed_point_position); + CLTensor dst = create_tensor<CLTensor>(dst_shape, dt, 1, fixed_point_position); + + // Create and configure function + CLDirectConvolutionLayer conv_layer; + conv_layer.configure(&src, &weights, &bias, &dst, conv_info); + + // Allocate tensors + src.allocator()->allocate(); + weights.allocator()->allocate(); + dst.allocator()->allocate(); + bias.allocator()->allocate(); + + BOOST_TEST(!src.info()->is_resizable()); + BOOST_TEST(!weights.info()->is_resizable()); + BOOST_TEST(!dst.info()->is_resizable()); + BOOST_TEST(!bias.info()->is_resizable()); + + // Fill tensors + switch(dt) + { + case DataType::F32: + { + std::uniform_real_distribution<> distribution(-1.f, 1.f); + library->fill(CLAccessor(src), distribution, 0); + library->fill(CLAccessor(weights), distribution, 1); + library->fill(CLAccessor(bias), distribution, 2); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + + // Compute function + conv_layer.run(); + + return dst; +} + +TensorShape get_output_shape(TensorShape in_shape, TensorShape kernel_shape, const PadStrideInfo &conv_info) +{ + TensorShape out_shape(in_shape); + const std::pair<unsigned int, unsigned int> scaled_dims = arm_compute::scaled_dimensions(in_shape.x(), + in_shape.y(), + kernel_shape.x(), + kernel_shape.y(), + conv_info); + out_shape.set(0, scaled_dims.first); + out_shape.set(1, scaled_dims.second); + out_shape.set(2, kernel_shape[3]); + return out_shape; +} + +} // namespace + +#ifndef DOXYGEN_SKIP_THIS 
+BOOST_AUTO_TEST_SUITE(CL) +BOOST_AUTO_TEST_SUITE(DirectConvolutionLayer) + +BOOST_AUTO_TEST_SUITE(Float) + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * CNNFloatDataTypes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(0, 2, + 1) + * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }), + input_shape, dt, sx, sy, px, py, num_kernels) +{ + const unsigned int kernel_size = 3; + const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR); + const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels)); + const TensorShape b_shape(static_cast<unsigned int>(num_kernels)); + const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info)); + + CLTensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info); + + RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0); + + // Validate output + validate(CLAccessor(dst), ref, tolerance_fp); +} +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() +#endif /* DOXYGEN_SKIP_THIS */ diff --git a/tests/validation/NEON/CMakeLists.txt b/tests/validation/NEON/CMakeLists.txt index 988e1633f5..9dda17d149 100644 --- a/tests/validation/NEON/CMakeLists.txt +++ b/tests/validation/NEON/CMakeLists.txt @@ -37,7 +37,7 @@ set(arm_compute_test_validation_NEON_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseXor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Box3x3.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ConvolutionLayer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ConvolutionLayerDirect.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/DirectConvolutionLayer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/DepthConvert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/FillBorder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Fixedpoint/Exp_QS8.cpp diff --git 
a/tests/validation/NEON/ConvolutionLayerDirect.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index effb898428..034a8b2045 100644 --- a/tests/validation/NEON/ConvolutionLayerDirect.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -150,7 +150,7 @@ BOOST_DATA_TEST_CASE(W1x1, RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0); // Validate output - validate(Accessor(dst), ref); + validate(NEAccessor(dst), ref); } BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) @@ -172,7 +172,7 @@ BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::m RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0); // Validate output - validate(Accessor(dst), ref, tolerance_fp16); + validate(NEAccessor(dst), ref, tolerance_fp16); } BOOST_AUTO_TEST_SUITE_END() #endif /* ARM_COMPUTE_ENABLE_FP16 */ |