From df4cf57c7394265b27d051cb1cf0152c53659126 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 9 Oct 2019 15:32:39 +0100 Subject: COMPMID-2306: CLDepthwiseConvolution: support for QUANT8_PER_CHANNEL_SYMM Change-Id: I18c886400daa2dcba0b91011bc4e503d807a4732 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/2143 Comments-Addressed: Arm Jenkins Reviewed-by: Giorgio Arena Tested-by: Arm Jenkins --- arm_compute/core/CL/CLHelpers.h | 16 + .../CLDepthwiseConvolutionLayer3x3NCHWKernel.h | 58 +- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.h | 56 +- .../CLDepthwiseConvolutionLayerNativeKernel.h | 61 +- .../ICLDepthwiseConvolutionLayer3x3Kernel.h | 33 +- .../NEDepthwiseConvolutionLayerNativeKernel.h | 12 +- arm_compute/core/utils/quantization/AsymmHelpers.h | 17 +- .../CL/functions/CLDepthwiseConvolutionLayer.h | 37 +- .../NEON/functions/NEDepthwiseConvolutionLayer.h | 33 +- src/core/CL/CLHelpers.cpp | 48 + src/core/CL/CLKernelLibrary.cpp | 10 +- .../cl_kernels/depthwise_convolution_quantized.cl | 1201 ++++++++++++-------- src/core/CL/cl_kernels/helpers_asymm.h | 22 +- .../CL/kernels/CLChannelShuffleLayerKernel.cpp | 16 +- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 134 ++- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 133 ++- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 131 ++- ...pthwiseConvolutionLayerReshapeWeightsKernel.cpp | 3 +- .../CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp | 22 +- .../CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp | 22 +- .../CL/kernels/CLHeightConcatenateLayerKernel.cpp | 30 +- src/core/CL/kernels/CLPermuteKernel.cpp | 5 - src/core/CL/kernels/CLReverseKernel.cpp | 15 +- src/core/Utils.cpp | 3 + src/core/utils/quantization/AsymmHelpers.cpp | 22 + .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 204 +++- tests/validation/CL/DepthwiseConvolutionLayer.cpp | 109 ++ 27 files changed, 1570 insertions(+), 883 deletions(-) diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 1d647a86b0..9130e05121 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -50,6 +50,22 @@ static constexpr unsigned int max_cl_vector_width = 16; */ std::string get_cl_type_from_data_type(const DataType &dt); +/** Translates a tensor data type to the appropriate OpenCL promoted type. + * + * @param[in] dt @ref DataType to be used to get the promoted OpenCL type. + * + * @return The string specifying the OpenCL type to be used. + */ +std::string get_cl_promoted_type_from_data_type(const DataType &dt); + +/** Translates the element size to an unsigned integer data type + * + * @param[in] element_size Size in bytes of an element. + * + * @return The string specifying the OpenCL type to be used. + */ +std::string get_cl_unsigned_type_from_element_size(size_t element_size); + /** Translates a tensor data type to the appropriate OpenCL select type. * * @param[in] dt @ref DataType to be translated to OpenCL select type. diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h index 3b7fc7b7dc..a2f61ee058 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h @@ -39,35 +39,47 @@ public: CLDepthwiseConvolutionLayer3x3NCHWKernel(); /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) override; + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override; /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel * - * @param[in] input Source tensor info. DataType supported: F16/F32/QASYMM8. - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. DataType supported: F16/F32/QASYMM8. + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, + const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h index 7d0ecec13e..e8cca954e2 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h @@ -40,34 +40,46 @@ public: /** Default move assignment operator. */ /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. DataType supported: QASYMM8. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info, const Size2D &dilation) override; + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override; /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel * - * @param[in] input Source tensor info. DataType supported: QASYMM8. - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. DataType supported: QASYMM8. + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 31ec871123..8e8df9c1f6 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -49,36 +49,48 @@ public: CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default; /** Initialize the function's source, destination and parameters * - * @param[in] input Source tensor. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel * - * @param[in] input Source tensor info. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC - * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread - * @param[in] dwc_info Depthwise convolution layer info - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor info. Data type supported: QASYMM8/FP32/FP16. Data layout supported: NHWC + * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread + * @param[in] dwc_info Depthwise convolution layer info + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U), + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -89,6 +101,9 @@ private: const ICLTensor *_biases; ICLTensor *_output; unsigned int _depth_multiplier; + const ICLTensor *_output_multipliers; + const ICLTensor *_output_shifts; + bool _is_quantized; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h index 92eca89fd8..a6b4510115 100644 --- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h +++ b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h @@ -37,7 +37,7 @@ class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel public: /** Default constructor */ ICLDepthwiseConvolutionLayer3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1) + : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false) { } /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -50,18 +50,24 @@ public: ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default; /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. + * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 + * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization, + * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32 */ - virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)) = 0; + virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), + const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0; protected: BorderSize _border_size; @@ -70,6 +76,9 @@ protected: const ICLTensor *_weights; const ICLTensor *_biases; unsigned int _conv_stride_y; + const ICLTensor *_output_multipliers; + const ICLTensor *_output_shifts; + bool _is_quantized; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h index a0205f1ea6..6db1a767d8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h @@ -58,8 +58,10 @@ public: * @note Supported data layouts: NHWC * * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -73,8 +75,10 @@ public: * @note Supported data layouts: NHWC * * @param[in] input Source tensor info. DataType supported: QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. + * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor info. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h index bc5b9dbdba..8ec4a331f6 100644 --- a/arm_compute/core/utils/quantization/AsymmHelpers.h +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__ #include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" namespace arm_compute @@ -60,9 +61,23 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift); /** Get minimum and maximum values for the input quantized data type * - * @ return min and max values for the quantized data type + * @return min and max values for the quantized data type */ std::pair get_min_max_values_from_quantized_data_type(DataType data_type); +/** Compute quantized per-channel multipliers and shifts. As many multipliers + * and shifts as output channels are computed. If weights are not quantized + * per-channel, multipliers and shifts will end up being the same for each + * channel. + * + * @param[in] input Input tensor. + * @param[in] weights Weights tensor. + * @param[in] output Output tensor. + * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers. + * @param[out] output_shifts_ptr Pointer to the buffer where to store per-channel shifts. + * + * @return min and max values for the quantized data type + */ +void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr); } // namespace quantization } // namespace arm_compute #endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index b8b11f08b2..e15f62f779 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -58,7 +58,8 @@ public: /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -73,7 +74,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. @@ -95,7 +97,8 @@ private: /** Static function to choose the best depthwise convolution function for @ref CLDepthwiseConvolutionLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. @@ -135,7 +138,8 @@ private: /** Initialize the function's source, destination, conv and border_size. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -150,7 +154,8 @@ private: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW. - * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. @@ -184,10 +189,15 @@ private: CLTensor _permuted_input; CLTensor _permuted_weights; CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; bool _needs_permute; bool _needs_weights_reshape; bool _is_prepared; + bool _is_quantized; }; /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels: @@ -212,7 +222,8 @@ private: /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -227,7 +238,8 @@ private: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerGeneric * * @param[in] input Source tensor info. Data type supported: QASYMM8/F32. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. @@ -261,10 +273,15 @@ private: CLTensor _permuted_input; CLTensor _permuted_weights; CLTensor _permuted_output; + CLTensor _output_multipliers; + CLTensor _output_shifts; const ITensor *_original_weights; + const ITensor *_input; + const ITensor *_output; bool _needs_permute; bool _is_prepared; + bool _is_quantized; }; std::shared_ptr _memory_manager; @@ -298,7 +315,8 @@ public: /** Initialize the function's source, destination, conv and border_size. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. @@ -314,7 +332,8 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3 * * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW. - * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 8fe9644963..efe9cdfbf0 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -56,9 +56,10 @@ public: * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32 * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. @@ -71,9 +72,10 @@ public: * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. @@ -92,9 +94,10 @@ private: /** Static function to choose the best depthwise convolution function for @ref NEDepthwiseConvolutionLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -136,7 +139,7 @@ private: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -151,7 +154,7 @@ private: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -173,7 +176,7 @@ private: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -188,7 +191,7 @@ private: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -246,7 +249,8 @@ private: * * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. @@ -261,7 +265,8 @@ private: * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] conv_info Padding and stride information to use for the convolution. @@ -328,7 +333,7 @@ public: * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. @@ -344,7 +349,7 @@ public: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input. * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input. + * Data type supported: Same as @p input, S32 when input is QASYMM8. * @param[in] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index a3c73677c7..1132aa4540 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -70,6 +70,54 @@ std::string get_cl_type_from_data_type(const DataType &dt) } } +std::string get_cl_promoted_type_from_data_type(const DataType &dt) +{ + switch(dt) + { + case DataType::U8: + case DataType::QASYMM8: + case DataType::QASYMM8_PER_CHANNEL: + return "ushort"; + case DataType::S8: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + return "short"; + case DataType::U16: + case DataType::QASYMM16: + return "uint"; + case DataType::S16: + case DataType::QSYMM16: + return "int"; + case DataType::U32: + return "ulong"; + case DataType::S32: + return "long"; + case DataType::F16: + return "float"; + default: + ARM_COMPUTE_ERROR("Cannot get promoted OpenCL type for the input data type."); + return ""; + } +} + +std::string get_cl_unsigned_type_from_element_size(size_t element_size) +{ + switch(element_size) + { + case 1: + return "uchar"; + case 2: + return "ushort"; + case 4: + return "uint"; + case 8: + return "ulong"; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + return ""; + } +} + std::string get_cl_select_type_from_data_type(const DataType &dt) { switch(dt) diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index b2905a848b..5d5205439e 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -223,11 +223,11 @@ const std::map CLKernelLibrary::_kernel_program_map = { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" }, { "dwc_MxN_native_fp_nhwc", "depthwise_convolution.cl" }, { "dwc_MxN_native_quantized8_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, - { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_quantized8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_native_quantized8_dot8_nchw", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, + { "dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" }, { "depth_to_space_nchw", "depth_to_space.cl" }, { "depth_to_space_nhwc", "depth_to_space.cl" }, { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" }, diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index 94373b74e7..dbcfae610f 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -33,7 +33,6 @@ #endif /* VEC_SIZE */ #if defined(ACTIVATION_TYPE) && defined(CONST_0) -#define DATA_TYPE uchar #include "activation_layer_quant.cl" #define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x) #else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */ @@ -42,11 +41,16 @@ #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) -#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE) #define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE) -#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) +#if defined(DATA_TYPE) && defined(WEIGHTS_TYPE) + +#define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size) + +#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) + +#if defined(WEIGHTS_PROMOTED_TYPE) +#define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size) #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) @@ -62,77 +66,77 @@ #error "Stride X not supported" #endif /* CONV_STRIDE_X > 3 */ -#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) +#if !defined(IS_DOT8) #if DILATION_X == 1 #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int8 temp0 = CONVERT(vload8(0, first_value), int8); \ - int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ + int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \ \ - left = CONVERT(temp0.s01234567, int8); \ - middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ - right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ + left = CONVERT(temp0.s01234567, int8); \ + middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ + right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \ \ - left = CONVERT(temp0.s02468ace, int8); \ - middle = CONVERT(temp0.s13579bdf, int8); \ - right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ + left = CONVERT(temp0.s02468ace, int8); \ + middle = CONVERT(temp0.s13579bdf, int8); \ + right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \ \ - left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ - middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ - right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ + left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ + right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ }) #endif /* CONV_STRIDE_X */ #else /* DILATION_X == 1 */ #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - left = CONVERT(vload8(0, first_value), int8); \ - middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8); \ - right = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ + middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \ + right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - left = CONVERT(temp0.s02468ace, int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + left = CONVERT(temp0.s02468ace, int8); \ \ - temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \ - middle = CONVERT(temp0.s02468ace, int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ + middle = CONVERT(temp0.s02468ace, int8); \ \ - temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \ - right = CONVERT(temp0.s02468ace, int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ + right = CONVERT(temp0.s02468ace, int8); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - int16 temp0 = CONVERT(vload16(0, first_value), int16); \ - int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \ - left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ + int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \ + left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ \ - temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \ - temp1 = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8); \ - middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ + temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \ + middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ \ - temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \ - temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \ - right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ + temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ + temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \ + right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ }) #endif /* CONV_STRIDE_X */ @@ -140,49 +144,61 @@ /** This function computes the depthwise convolution quantized. * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void dwc_3x3_native_qasymm8_nchw( +__kernel void dwc_3x3_native_quantized8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -198,9 +214,20 @@ __kernel void dwc_3x3_native_qasymm8_nchw( src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; - uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y); - uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y); - uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 3) + w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y)); + +#if defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, channel)); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); +#endif // defined(PER_CHANNEL_QUANTIZATION) int8 values0 = 0; int8 sum0 = 0; @@ -285,9 +312,10 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #if INPUT_OFFSET != 0 - ushort sum_weights = 0; - ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2); - sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2; + VEC_WEIGHTS_PROMOTED_TYPE(3) + tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3)); + + WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2; values0 += sum_weights * (int8)(INPUT_OFFSET); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 values1 += sum_weights * (int8)(INPUT_OFFSET); @@ -307,14 +335,13 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values0 += (int8)OUTPUT_OFFSET; - uchar8 res0 = convert_uchar8_sat(values0); - res0 = max(res0, (uchar8)0); - res0 = min(res0, (uchar8)255); + VEC_TYPE(8) + res0 = CONVERT_SAT(values0, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -324,134 +351,156 @@ __kernel void dwc_3x3_native_qasymm8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values1 += (int8)OUTPUT_OFFSET; - uchar8 res1 = convert_uchar8_sat(values1); - res1 = max(res1, (uchar8)0); - res1 = min(res1, (uchar8)255); + VEC_TYPE(8) + res1 = CONVERT_SAT(values1, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y); #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/ } -#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) +#else // !defined(IS_DOT8) + #if DILATION_X == 1 #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar8 temp0 = vload8(0, first_value); \ - uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(8) \ + temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(2) \ + temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \ \ - left = temp0.s01234567; \ - middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \ - right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \ + left = temp0.s01234567; \ + middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \ + right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar temp1 = *(first_value + 16 * sizeof(uchar)); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ \ - left = temp0.s02468ace; \ - middle = temp0.s13579bdf; \ - right = (uchar8)(temp0.s2468, temp0.sace, temp1); \ + left = temp0.s02468ace; \ + middle = temp0.s13579bdf; \ + right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(8) \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ \ - left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ - middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \ - right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \ + left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ + middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \ + right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \ }) #endif /* CONV_STRIDE_X */ #else /*DILATION_X==1*/ #if CONV_STRIDE_X == 1 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - left = vload8(0, first_value); \ - middle = vload8(0, first_value + DILATION_X * sizeof(uchar)); \ - right = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + left = vload8(0, (__global DATA_TYPE *)(first_value)); \ + middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ }) #elif CONV_STRIDE_X == 2 -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - left = temp0.s02468ace; \ - temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \ - middle = temp0.s02468ace; \ - temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ - right = temp0.s02468ace; \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + left = temp0.s02468ace; \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + middle = temp0.s02468ace; \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ + right = temp0.s02468ace; \ }) #else /* CONV_STRIDE_X */ -#define GET_VALUES(first_value, left, middle, right) \ - ({ \ - uchar16 temp0 = vload16(0, first_value); \ - uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \ - left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ +#define GET_VALUES(first_value, left, middle, right) \ + ({ \ + VEC_TYPE(16) \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ + VEC_TYPE(8) \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE)))); \ + left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ \ - temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \ - temp1 = vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))); \ - middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \ + middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ \ - temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \ - temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \ - right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \ + temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ + temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \ + right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ }) #endif /* CONV_STRIDE_X */ #endif /*DILATION_X==1*/ /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW. * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @note Per-channel quantization is not supported by this kernel. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ -__kernel void dwc_3x3_native_qasymm8_dot8_nchw( +__kernel void dwc_3x3_native_quantized8_dot8_nchw( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) #endif //defined(HAS_BIAS) ) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts); // Extract channel and linearized batch indices const int channel = get_global_id(2) % DST_CHANNELS; @@ -467,13 +516,22 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z; __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z; - uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y); - uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y); - uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y); + VEC_TYPE(3) + w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y)); + VEC_TYPE(3) + w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y)); + VEC_TYPE(3) + w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y)); - uchar8 left0, middle0, right0; - uchar8 left1, middle1, right1; - uchar8 left2, middle2, right2; + const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0)); + const int output_shift = *((__global int *)vector_offset(&output_shifts, 0)); + + VEC_TYPE(8) + left0, middle0, right0; + VEC_TYPE(8) + left1, middle1, right1; + VEC_TYPE(8) + left2, middle2, right2; int8 values0 = 0; int8 sum0 = 0; @@ -491,9 +549,10 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 // If conv_stride_y is equals to 1, we compute two output rows - uchar8 left3, middle3, right3; - int8 values1 = 0; - int8 sum1 = 0; + VEC_TYPE(8) + left3, middle3, right3; + int8 values1 = 0; + int8 sum1 = 0; GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3); @@ -504,69 +563,69 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 - ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0); - ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0); + ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0); + ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0); values0.s0 += right2.s0 * w2.s2; - ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1); - ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1); + ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1); + ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1); values0.s1 += right2.s1 * w2.s2; - ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2); - ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2); + ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2); + ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2); values0.s2 += right2.s2 * w2.s2; - ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3); - ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3); + ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3); + ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3); values0.s3 += right2.s3 * w2.s2; - ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4); - ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4); + ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4); + ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4); values0.s4 += right2.s4 * w2.s2; - ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5); - ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5); + ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5); + ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5); values0.s5 += right2.s5 * w2.s2; - ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6); - ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6); + ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6); + ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6); values0.s6 += right2.s6 * w2.s2; - ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7); - ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7); + ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7); + ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7); values0.s7 += right2.s7 * w2.s2; #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 - ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0); - ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0); + ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0); + ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0); values1.s0 += right3.s0 * w2.s2; - ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1); - ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1); + ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1); + ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1); values1.s1 += right3.s1 * w2.s2; - ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2); - ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2); + ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2); + ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2); values1.s2 += right3.s2 * w2.s2; - ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3); - ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3); + ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3); + ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3); values1.s3 += right3.s3 * w2.s2; - ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4); - ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4); + ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4); + ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4); values1.s4 += right3.s4 * w2.s2; - ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5); - ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5); + ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5); + ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5); values1.s5 += right3.s5 * w2.s2; - ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6); - ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6); + ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6); + ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6); values1.s6 += right3.s6 * w2.s2; - ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7); - ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7); + ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7); + ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7); values1.s7 += right3.s7 * w2.s2; #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 @@ -585,8 +644,9 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #endif /* WEIGHTS_OFFSET != 0 */ #if INPUT_OFFSET != 0 - ushort sum_weights = 0; - ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2); + WEIGHTS_PROMOTED_TYPE sum_weights = 0; + VEC_WEIGHTS_PROMOTED_TYPE(3) + tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3)); sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2; values0 += sum_weights * (int8)(INPUT_OFFSET); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -607,14 +667,13 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values0 += (int8)OUTPUT_OFFSET; - uchar8 res0 = convert_uchar8_sat(values0); - res0 = max(res0, (uchar8)0); - res0 = min(res0, (uchar8)255); + VEC_TYPE(8) + res0 = CONVERT_SAT(values0, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr); #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 @@ -625,20 +684,19 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #else // defined(REAL_MULTIPLIER) - values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8); + values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8); #endif // defined(REAL_MULTIPLIER) values1 += (int8)OUTPUT_OFFSET; - uchar8 res1 = convert_uchar8_sat(values1); - res1 = max(res1, (uchar8)0); - res1 = min(res1, (uchar8)255); + VEC_TYPE(8) + res1 = CONVERT_SAT(values1, VEC_TYPE(8)); vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y); #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/ } -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // !defined(IS_DOT8) #endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */ @@ -646,7 +704,7 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE) -#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT) +#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT) #if WEIGHTS_OFFSET != 0 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \ @@ -661,23 +719,23 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \ ({ \ - ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc); \ - ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc); \ + ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \ + ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \ acc += val8 * w1; \ }) #define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \ ({ \ sum = val0; \ - ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum); \ - ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum); \ + ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \ + ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \ }) #define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \ ({ \ sum = w1; \ - ARM_DOT(w0.s0123, (uchar4)1, sum); \ - ARM_DOT(w0.s4567, (uchar4)1, sum); \ + ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \ + ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \ }) #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) @@ -694,42 +752,52 @@ __kernel void dwc_3x3_native_qasymm8_dot8_nchw( * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X) * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1) * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset Max offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset Max offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif /* defined(HAS_BIAS) */ @@ -741,7 +809,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -749,7 +817,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -768,19 +836,30 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( VEC_INT acc = 0, sum = 0; // Load weights - uchar16 w0_tmp = VLOAD(16)(0, weights_addr); - uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); - uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); - - uchar4 w0 = w0_tmp.s0123; - uchar4 w1 = w0_tmp.s4567; - uchar4 w2 = w0_tmp.s89AB; - uchar4 w3 = w0_tmp.sCDEF; - - uchar4 w4 = w1_tmp.s0123; - uchar4 w5 = w1_tmp.s4567; - uchar4 w6 = w1_tmp.s89AB; - uchar4 w7 = w1_tmp.sCDEF; + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16)); + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w0 = w0_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w1 = w0_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w2 = w0_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w3 = w0_tmp.sCDEF; + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w4 = w1_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w5 = w1_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w6 = w1_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -798,27 +877,36 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); // z == 2 // Offset can be out-of-bound so we need to check if it is greater than max_offset - z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; - offset = y_offset + (int4)(z_coord * src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); + z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; + offset = y_offset + (int4)(z_coord * src_stride_z); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum); MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum); @@ -854,24 +942,34 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); -#else // defined(REAL_MULTIPLIER) +#else // defined(REAL_MULTIPLIER) +#if defined(PER_CHANNEL_QUANTIZATION) + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); + VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); + VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift); - acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); #endif // defined(REAL_MULTIPLIER) acc += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR); - res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; #endif /* defined(DST_DEPTH) */ VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res), 0, dst_addr); + (ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr)); } #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) @@ -887,43 +985,53 @@ __kernel void dwc_3x3_reshaped_qasymm8_nhwc( * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1) * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1). * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset Max offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset Max offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif /* defined(HAS_BIAS) */ @@ -935,7 +1043,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -943,7 +1051,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -965,19 +1073,30 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( VEC_INT acc3 = 0, sum3 = 0; // Load weights - uchar16 w0_tmp = VLOAD(16)(0, weights_addr); - uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16); - uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16); - - uchar4 w0 = w0_tmp.s0123; - uchar4 w1 = w0_tmp.s4567; - uchar4 w2 = w0_tmp.s89AB; - uchar4 w3 = w0_tmp.sCDEF; - - uchar4 w4 = w1_tmp.s0123; - uchar4 w5 = w1_tmp.s4567; - uchar4 w6 = w1_tmp.s89AB; - uchar4 w7 = w1_tmp.sCDEF; + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 16) + w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16)); + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w0 = w0_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w1 = w0_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w2 = w0_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w3 = w0_tmp.sCDEF; + + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w4 = w1_tmp.s0123; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w5 = w1_tmp.s4567; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w6 = w1_tmp.s89AB; + VEC_DATA_TYPE(WEIGHTS_TYPE, 4) + w7 = w1_tmp.sCDEF; #if INPUT_OFFSET != 0 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT) @@ -995,40 +1114,56 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 2 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be out-of-bound so we need to check if it is greater than max_offset offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 3 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be out-of-bound so we need to check if it is greater than max_offset offset += (int4)(src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0); MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0); @@ -1115,10 +1250,20 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #else // defined(REAL_MULTIPLIER) - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); +#if defined(PER_CHANNEL_QUANTIZATION) + Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers); + Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts); + VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr); + VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr); +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); + acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift); + acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift); #endif // defined(REAL_MULTIPLIER) @@ -1127,15 +1272,14 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( acc2 += (VEC_INT)OUTPUT_OFFSET; acc3 += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR); - VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR); - VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR); - VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR); - - res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255); - res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255); - res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255); - res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w; @@ -1153,15 +1297,16 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) { VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z); + (ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z)); VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z); + (ACTIVATION_FUNC(res3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z)); } } #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product. * + * @note Per-channel quantization is not supported by this kernel. * @note This kernel assumes VEC_SIZE is 4. * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel. * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2) @@ -1173,42 +1318,52 @@ __kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc( * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication. * If not, the quantization will be performed using a fixed point multiplication * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8 - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - * @param[in] max_offset The maximum allowed offset for the input tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] max_offset The maximum allowed offset for the input tensor */ -__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( +__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), IMAGE_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts), #if defined(HAS_BIAS) VECTOR_DECLARATION(biases), #endif // defined(HAS_BIAS) @@ -1220,7 +1375,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y; @@ -1228,7 +1383,7 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #if defined(DST_DEPTH) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE; #endif /* defined(DST_DEPTH) */ int z_coord = 0; @@ -1250,16 +1405,19 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( VEC_INT sum1 = 0; // Load weights - uchar16 w0 = VLOAD(16)(0, weights_addr); - uchar16 w1 = VLOAD(16)(0, weights_addr + 16); - uchar4 w2 = VLOAD(4)(0, weights_addr + 32); + VEC_TYPE(16) + w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr)); + VEC_TYPE(16) + w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16)); + VEC_TYPE(4) + w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32)); #if INPUT_OFFSET != 0 // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8); - DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA); - DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); // Multiply the weights reduction with INPUT_OFFSET acc0 = INPUT_OFFSET * acc0; @@ -1277,30 +1435,42 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( offset = y_offset + (int4)(z_coord * src_stride_z); offset = min(offset, (int4)max_offset); - VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + VEC_TYPE(VEC_SIZE) + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 1 // z_coord can be only negative for z = 0 so we do not need to clamp it // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z - (int)CONV_PAD_TOP + 1; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + z_coord = z - (int)CONV_PAD_TOP + 1; + offset = y_offset + (int4)(z_coord * src_stride_z); + VEC_TYPE(VEC_SIZE) + values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); // z == 2 // After z = 1 we can simply add src_stride_z to offset without updating z_coord // However offset can be out-of-bound so we need to check if it is greater than max_offset offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0); - VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1); - VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2); - VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3); + offset = min(offset, (int4)max_offset); + VEC_TYPE(VEC_SIZE) + values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); + VEC_TYPE(VEC_SIZE) + values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); + VEC_TYPE(VEC_SIZE) + values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_TYPE(VEC_SIZE) + values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0); DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0); @@ -1309,8 +1479,8 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1); DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1); - DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); - DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); + DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1); DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2); DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2); @@ -1319,8 +1489,8 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3); DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3); - DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); - DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); + DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3); #if defined(HAS_BIAS) Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); @@ -1349,19 +1519,20 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT); #else // defined(REAL_MULTIPLIER) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); - acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); - acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT); + acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift); + acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift); #endif // defined(REAL_MULTIPLIER) acc0 += (VEC_INT)OUTPUT_OFFSET; acc1 += (VEC_INT)OUTPUT_OFFSET; - VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR); - VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR); - - res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255); - res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255); + VEC_TYPE(VEC_SIZE) + res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE)); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE)); #if defined(DST_DEPTH) __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w; @@ -1370,9 +1541,9 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #endif /* defined(DST_DEPTH) */ VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y); + (ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); VSTORE(VEC_SIZE) - (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y); + (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); } #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4 @@ -1380,9 +1551,11 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) -#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) +#endif // defined(WEIGHTS_PROMOTED_TYPE) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER)) + +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2) @@ -1398,43 +1571,53 @@ __kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc( * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8 - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32 + * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector + * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32 + * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes) + * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector + * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector */ __kernel void dwc_MxN_native_quantized8_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights) + TENSOR3D_DECLARATION(weights), + VECTOR_DECLARATION(output_multipliers), + VECTOR_DECLARATION(output_shifts) #if defined(HAS_BIAS) , VECTOR_DECLARATION(biases) @@ -1447,19 +1630,30 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(uchar) * (int)N0; + __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; - __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; + __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; - __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0; #if defined(HAS_BIAS) __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; #endif // defined(HAS_BIAS) +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + + VEC_INT output_multiplier = (VEC_INT)0; + VEC_INT output_shift = (VEC_INT)0; +#else // defined(PER_CHANNEL_QUANTIZATION) + const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes)); + const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + #if defined(DST_DEPTH) s_addr += b * src_stride_w; d_addr += b * dst_stride_w; @@ -1489,8 +1683,8 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int w_offset = xk * weights_stride_y + yk * weights_stride_z; // Load input and weights values - VEC_SHORT i = CONVERT(VLOAD(N0)(0, (__global uchar *)(s_addr + s_offset)), VEC_SHORT); - VEC_SHORT w = CONVERT(VLOAD(N0)(0, (__global uchar *)(w_addr + w_offset)), VEC_SHORT); + VEC_SHORT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_SHORT); + VEC_SHORT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_SHORT); res += (i + (VEC_SHORT)INPUT_OFFSET) * (w + (VEC_SHORT)WEIGHTS_OFFSET); } @@ -1505,21 +1699,32 @@ __kernel void dwc_MxN_native_quantized8_nhwc( res += bias; #endif // defined(HAS_BIAS) - res = CONVERT(ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(CONVERT(res, VEC_INT), OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0), VEC_SHORT); +#if defined(PER_CHANNEL_QUANTIZATION) + output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr)); + output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr)); +#endif // defined(PER_CHANNEL_QUANTIZATION) + + res = CONVERT(ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(CONVERT(res, VEC_INT), output_multiplier, output_shift, N0), VEC_SHORT); res += (VEC_SHORT)OUTPUT_OFFSET; - VEC_UCHAR res1 = CONVERT_SAT(res, VEC_UCHAR); + VEC_TYPE(VEC_SIZE) + res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE)); VSTORE(N0) - (ACTIVATION_FUNC(res1), 0, (__global uchar *)(d_addr)); + (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr)); #if DEPTH_MULTIPLIER > 1 - w_addr += sizeof(uchar); - d_addr += sizeof(uchar); + w_addr += sizeof(WEIGHTS_TYPE); + d_addr += sizeof(DATA_TYPE); +#if defined(PER_CHANNEL_QUANTIZATION) + out_mul_addr += sizeof(int); + out_shift_addr += sizeof(int); +#endif // defined(PER_CHANNEL_QUANTIZATION) #if defined(HAS_BIAS) b_addr += sizeof(int); #endif // defined(HAS_BIAS) } #endif // DEPTH_MULTIPLIER > 1 } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) +#endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 57ecccc2b2..f115602a1a 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -93,16 +93,18 @@ inline float dequantize_qasymm8(uchar input, float offset, float scale) * * @return Correctly-rounded-to-nearest division by a power-of-two. */ -#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - VEC_DATA_TYPE(int, size) \ - mask = (1 << exponent) - 1; \ - const VEC_DATA_TYPE(int, size) zero = 0; \ - const VEC_DATA_TYPE(int, size) one = 1; \ - VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) \ + zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) \ + one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index f232f6cfc0..e883e8f250 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -113,21 +113,7 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast(channels - vec_size), 0))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); // Create kernel std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout)); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index 42e5fbc8f2..a2f4a913ce 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -37,13 +37,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D dilation) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -52,7 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC), "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3); @@ -74,28 +75,43 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(output->total_size() != 0) + if(is_qasymm) { - const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - } + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(is_qasymm) + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); + } + } + else { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - ARM_COMPUTE_UNUSED(multiplier); - ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + if(output->total_size() != 0) + { + const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation) +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, GPUTarget gpu_target, std::string &kernel_name, const Size2D dilation) { // Output auto inizialitation if not yet initialized const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); @@ -182,9 +198,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } else { - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_data_type_quantized_per_channel(weights->data_type()); - kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3"; + kernel_name = is_qasymm ? "dwc_3x3_native_quantized8" : "depthwise_convolution_3x3"; kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : ""); kernel_name += (is_qasymm ? "_nchw" : ""); @@ -224,23 +240,28 @@ BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const return _border_size; } -void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation)); - - bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); - - _input = input; - _output = output; - _weights = weights; - _biases = biases; - _conv_stride_x = conv_info.stride().first; - _conv_stride_y = conv_info.stride().second; - _conv_pad_left = conv_info.pad_left(); - _conv_pad_top = conv_info.pad_top(); - _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, act_info, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + + _input = input; + _output = output; + _weights = weights; + _biases = biases; + _conv_stride_x = conv_info.stride().first; + _conv_stride_y = conv_info.stride().second; + _conv_pad_left = conv_info.pad_left(); + _conv_pad_top = conv_info.pad_top(); + _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left); + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); // Configure kernel window std::string kernel_name; @@ -260,24 +281,21 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); - if(is_qasymm) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); if(act_info.enabled()) { @@ -293,6 +311,10 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); + build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type())); } else { @@ -323,12 +345,15 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, _config_id += support::cpp11::to_string(output->info()->dimension(1)); } -Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation) +Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, + const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { std::string kernel_name; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name, dilation).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), + conv_info, depth_multiplier, gpu_target, kernel_name, dilation) + .first); return Status{}; } @@ -353,18 +378,28 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::Com slice_weights.set_dimension_step(Window::DimX, 0); slice_weights.set_dimension_step(Window::DimY, 0); + unsigned int idx = 3 * num_arguments_per_3D_tensor(); + + // Set output multipliers in case of quantized data type + if(_is_quantized) + { + Window slice; + slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape()); + add_1D_tensor_argument(idx, _output_multipliers, slice); + add_1D_tensor_argument(idx, _output_shifts, slice); + } + // Set biases if(_biases != nullptr) { - unsigned int idx = 3 * num_arguments_per_3D_tensor(); - Window slice_biases; + Window slice_biases; slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape()); add_1D_tensor_argument(idx, _biases, slice_biases); } do { - unsigned int idx = 0; + idx = 0; add_3D_tensor_argument(idx, _input, slice_in); add_3D_tensor_argument(idx, _output, slice_out); add_3D_tensor_argument(idx, _weights, slice_weights); @@ -373,3 +408,4 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::Com } while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in)); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index b8b144dbfa..d5f37f32ce 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -41,17 +41,18 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::QASYMM8) + && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC), "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1); @@ -63,26 +64,47 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const size_t weights_width = 3; const size_t weights_height = 3; + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( + *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation); if(is_qasymm) { DepthwiseConvolutionReshapeInfo info; info.c0 = 4; ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(0) / info.c0) != weights_width * weights_height); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[0] != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); + } } else { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(1) != weights_width) || (weights->dimension(2) != weights_height)); } if(biases != nullptr) { + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[0]); if(is_qasymm) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } else { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); } @@ -91,27 +113,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, if(output->total_size() != 0) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( - *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } - if(is_qasymm) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; - - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - ARM_COMPUTE_UNUSED(multiplier); - ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); - } - return Status{}; } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { const size_t weights_width = 3; const size_t weights_height = 3; @@ -144,7 +154,17 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen if(is_qasymm) { - window_changed = update_window_and_padding(win, input_access, output_access); + if((output_multipliers != nullptr) && (output_shifts != nullptr)) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_accessed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, input_access, output_access, output_multipliers_access, output_shifts_access); + } + else + { + Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); + return std::make_pair(err, win); + } } else { @@ -157,7 +177,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration); window_changed = window_changed || update_window_and_padding(win, bias_access); } - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -175,19 +194,26 @@ BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const return _border_size; } -void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation)); - auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, act_info, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, + (output_shifts != nullptr) ? output_shifts->info() : nullptr); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - const bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type()); const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; _input = input; _output = output; @@ -196,16 +222,19 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _conv_stride_y = conv_info.stride().second; _num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1 - if(is_dot8_supported && is_qasymm) + if(is_dot8_supported && _is_quantized) { _num_planes_processed_per_iteration = 1; } - _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); + _border_size = BorderSize(_is_quantized && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); - const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size()); + const unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : (8 / input->info()->element_size()); CLBuildOptions build_opts; build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); @@ -217,24 +246,19 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - if(is_qasymm) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1))); build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * iq_info.offset * wq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option_if(is_dot8_supported, "-DIS_DOT8"); if(act_info.enabled()) { @@ -250,6 +274,10 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); + build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type())); } else { @@ -274,9 +302,9 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, std::string kernel_name; // Create kernel - if(is_qasymm) + if(_is_quantized) { - kernel_name = std::string("dwc_3x3_reshaped_qasymm8"); + kernel_name = std::string("dwc_3x3_reshaped_quantized8"); kernel_name += (is_dot8_supported && is_stride_1_dilation_1 ? "_dot8" : ""); kernel_name += (is_stride_1_dilation_1 ? "_stride1" : ""); kernel_name += "_nhwc"; @@ -309,13 +337,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, _config_id += string_from_data_type(input->info()->data_type()); } -Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, output_multipliers, output_shifts)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), biases != nullptr ? biases->clone().get() : nullptr, - output->clone().get(), conv_info, depth_multiplier, dilation) + output->clone().get(), conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr, + (output_shifts != nullptr) ? output_shifts->clone().get() : nullptr) .first); return Status{}; @@ -329,7 +360,6 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3); - const bool is_qasymm = is_data_type_quantized_asymmetric(_input->info()->data_type()); Window win = window_collapsed; win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast(_num_planes_processed_per_iteration)) * total_batches, 1)); @@ -344,7 +374,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com Window slice_in = win_in.first_slice_window_4D(); Window slice_out = win.first_slice_window_4D(); - unsigned int idx = 2 * num_arguments_per_4D_tensor() + (is_qasymm ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + + if(_is_quantized) + { + Window slice; + slice.use_tensor_dimensions(_output_multipliers->info()->tensor_shape()); + slice.set_dimension_step(Window::DimX, window.x().step()); + add_1D_tensor_argument(idx, _output_multipliers, slice); + add_1D_tensor_argument(idx, _output_shifts, slice); + } if(_biases != nullptr) { @@ -398,7 +437,7 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com unsigned int idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_4D_tensor_argument(idx, _output, slice_out); - if(is_qasymm) + if(_is_quantized) { add_2D_tensor_argument(idx, _weights, slice_out); } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 2115fc614d..3fc236eaa7 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -42,13 +42,13 @@ namespace arm_compute namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1 && dwc_weights_info.n0 != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().second < 1); @@ -57,24 +57,53 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_UNUSED(idx_c); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * depth_multiplier)); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); + + const bool is_quantized = is_data_type_quantized(input->data_type()); + if(biases != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(input->data_type())) + if(is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + } + } + + if(is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_shifts->dimension(0)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_multipliers->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(1 != output_shifts->dimension(0)); } } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } if(output->total_size() != 0) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } @@ -82,7 +111,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { ARM_COMPUTE_UNUSED(dwc_info); @@ -113,6 +143,21 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen window_changed = update_window_and_padding(win, input_access, weights_access, output_access); } + if(is_data_type_quantized(input->data_type())) + { + if((output_multipliers != nullptr) && (output_shifts != nullptr)) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, n0); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, n0); + window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + } + else + { + Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); + return std::make_pair(err, win); + } + } + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -121,32 +166,44 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel() - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _depth_multiplier(1) + : _input(nullptr), + _weights(nullptr), + _biases(nullptr), + _output(nullptr), + _depth_multiplier(1), + _output_multipliers(nullptr), + _output_shifts(nullptr), + _is_quantized(false) { } void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, + const ICLTensor *output_multipliers, const ICLTensor *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, - dilation)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); - auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, - dilation); + auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - _input = input; - _output = output; - _weights = weights; - _biases = biases; - _depth_multiplier = depth_multiplier; + _input = input; + _output = output; + _weights = weights; + _biases = biases; + _depth_multiplier = depth_multiplier; + _output_multipliers = output_multipliers; + _output_shifts = output_shifts; + _is_quantized = is_data_type_quantized(input->info()->data_type()); const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); const size_t weights_width = weights->info()->dimension(idx_w); const size_t weights_height = weights->info()->dimension(idx_h); - const bool is_quantized = is_data_type_quantized(input->info()->data_type()); CLBuildOptions build_opts; build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); @@ -166,24 +223,18 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - std::string kernel_name = (is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc"; + std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc"; - if(is_quantized) + if(_is_quantized) { const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); const UniformQuantizationInfo wq_info = _weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); - float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); - build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iq_info.offset)); build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wq_info.offset)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oq_info.offset)); - build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_opts.add_option_if(is_data_type_quantized_per_channel(weights->info()->data_type()), "-DPER_CHANNEL_QUANTIZATION"); if(dwc_info.activation_info.enabled()) { @@ -199,6 +250,9 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1)); build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1)); } + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type())); } else { @@ -228,12 +282,15 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, } Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) + const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), biases != nullptr ? biases->clone().get() : nullptr, - output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation) + output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? output_shifts->clone().get() : nullptr) .first); return Status{}; @@ -255,15 +312,23 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, _input->info()->tensor_shape()[0], 1)); } + unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor(); + + // Set output multipliers in case of quantized data type + if(_is_quantized) + { + add_1D_tensor_argument(idx, _output_multipliers, slice_in); + add_1D_tensor_argument(idx, _output_shifts, slice_in); + } + if(_biases != nullptr) { - unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor(); add_1D_tensor_argument(idx, _biases, slice_in); } do { - unsigned int idx = 0; + idx = 0; add_4D_tensor_argument(idx, _input, slice_in); add_4D_tensor_argument(idx, _output, slice_out); add_3D_tensor_argument(idx, _weights, slice_out); diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp index 1fd6312295..ec889ec949 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp @@ -47,7 +47,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(info.c0 != 4); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_h) != 3); @@ -98,10 +97,10 @@ void CLDepthwiseConvolutionLayerReshapeWeightsKernel::configure(const ICLTensor // Build the kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(info.c0)); build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(0))); build_opts.add_option_if(info.transpose, "-DTRANSPOSE"); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); _kernel = static_cast(CLKernelLibrary::get().create_kernel("depthwise_convolution_reshape_weights", build_opts.options())); } diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp index 72f2ca40f5..7010dffd25 100644 --- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp @@ -37,7 +37,8 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -139,21 +140,7 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor * build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1))); build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); std::string kernel_name("gemm_reshape_lhs_matrix_"); kernel_name += lhs_info.transpose ? "t" : "nt"; @@ -219,4 +206,5 @@ void CLGEMMReshapeLHSMatrixKernel::run(const Window &window, cl::CommandQueue &q enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp index 2ca4132b15..6f6019d26a 100644 --- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp @@ -37,7 +37,8 @@ #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -118,21 +119,7 @@ void CLGEMMReshapeRHSMatrixKernel::configure(const ICLTensor *input, ICLTensor * build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); - - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); std::string kernel_name("gemm_reshape_rhs_matrix_"); kernel_name += rhs_info.transpose ? "t" : "nt"; @@ -169,4 +156,5 @@ void CLGEMMReshapeRHSMatrixKernel::run(const Window &window, cl::CommandQueue &q enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp index ea292c0b7b..85917d38dd 100644 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp @@ -40,7 +40,8 @@ #include -using namespace arm_compute; +namespace arm_compute +{ namespace { std::pair validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) @@ -102,31 +103,7 @@ void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned // Add build options CLBuildOptions build_opts; - - switch(input->info()->element_size()) - { - case 1: - { - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - } - case 2: - { - build_opts.add_option("-DDATA_TYPE=short"); - break; - } - case 4: - { - build_opts.add_option("-DDATA_TYPE=int"); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported input data type."); - break; - } - } - + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); @@ -164,3 +141,4 @@ void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue add_4D_tensor_argument(idx, _output, window); enqueue(queue, *this, window, lws_hint()); } +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp index 9cb72b3c04..81a810fcb8 100644 --- a/src/core/CL/kernels/CLPermuteKernel.cpp +++ b/src/core/CL/kernels/CLPermuteKernel.cpp @@ -52,11 +52,6 @@ TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector & Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::U16, DataType::S16, - DataType::U32, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4, "Permutation upto 4-D input tensor is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 84bf5bf874..796f0d068a 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -81,20 +81,7 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const // Set kernel build options CLBuildOptions build_opts; build_opts.add_option("-DNUM_REVERSE_DIMS=" + support::cpp11::to_string(axis->info()->dimension(0))); - switch(input->info()->element_size()) - { - case 1: - build_opts.add_option("-DDATA_TYPE=uchar"); - break; - case 2: - build_opts.add_option("-DDATA_TYPE=ushort"); - break; - case 4: - build_opts.add_option("-DDATA_TYPE=uint"); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - } + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("reverse", build_opts.options())); diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index d9e05d7ee8..7e1af0e27d 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -292,6 +292,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const converted_string = ss.str(); break; case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: // Needs conversion to 32 bit, otherwise interpreted as ASCII values ss << int32_t(value.get()); converted_string = ss.str(); @@ -437,6 +438,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const print_consecutive_elements_impl(s, ptr, n, stream_width, element_delim); break; case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; case DataType::U16: @@ -473,6 +475,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp case DataType::QASYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl(s, ptr, n); case DataType::S8: + case DataType::QSYMM8_PER_CHANNEL: return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::U16: case DataType::QASYMM16: diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp index 42bd84db47..cdd48972eb 100644 --- a/src/core/utils/quantization/AsymmHelpers.cpp +++ b/src/core/utils/quantization/AsymmHelpers.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Helpers.h" #include #include @@ -134,5 +135,26 @@ std::pair get_min_max_values_from_quantized_data_type(DataType data_ty } return std::make_pair(min_quant_val, max_quant_val); } +void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr) +{ + const unsigned int idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int num_filters = is_data_type_quantized_per_channel(weights->info()->data_type()) ? weights->info()->dimension(idx_c) : 1; + + const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); + const QuantizationInfo wq_info = weights->info()->quantization_info(); + const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + + for(unsigned int i = 0; i < num_filters; ++i) + { + int output_multiplier = 0; + int output_shift = 0; + const float multiplier = iq_info.scale * wq_info.scale()[i] / oq_info.scale; + ARM_COMPUTE_ERROR_ON(multiplier > 1.0f); + calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + + output_multipliers_ptr[i] = output_multiplier; + output_shifts_ptr[i] = output_shift; + } +} } // quantization } // arm_compute diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 168d7d5c84..cdf3a95568 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -45,6 +45,7 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh { // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -58,15 +59,20 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh info.c0 = 4; info.transpose = is_stride_1_dilation_1 && is_dot8_supported; + TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); if(is_quantized) { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform(); + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; - ARM_COMPUTE_UNUSED(multiplier); - ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c))); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } } if(needs_permute) @@ -83,25 +89,29 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW); const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target, - dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, + conv_info, depth_multiplier, act_info, gpu_target, + dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } else if(is_nhwc) { if(needs_weights_reshape) { auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier, - act_info, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, + output, conv_info, depth_multiplier, act_info, + dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, + dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } return Status{}; } @@ -143,9 +153,14 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv _permuted_input(), _permuted_weights(), _permuted_output(), + _output_multipliers(), + _output_shifts(), _original_weights(), + _input(), + _output(), _needs_permute(false), - _is_prepared(false) + _is_prepared(false), + _is_quantized(false) { } @@ -162,8 +177,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( act_info, dilation)); + _is_quantized = is_data_type_quantized(input->info()->data_type()); _is_prepared = false; _original_weights = weights; + _input = input; + _output = output; _needs_permute = input->info()->data_layout() == DataLayout::NCHW; ICLTensor *input_to_use = input; @@ -190,11 +208,27 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( output_to_use = &_permuted_output; } + CLTensor *output_multipliers_to_use = nullptr; + CLTensor *output_shifts_to_use = nullptr; + if(_is_quantized) + { + const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1; + + _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + + output_multipliers_to_use = &_output_multipliers; + output_shifts_to_use = &_output_shifts; + } + DWCWeightsKernelInfo dwc_weights_info; dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation); + _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers_to_use, output_shifts_to_use); if(_needs_permute) { @@ -205,6 +239,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); _permuted_output.allocator()->allocate(); } + + if(_is_quantized) + { + _output_multipliers.allocator()->allocate(); + _output_shifts.allocator()->allocate(); + } } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -225,6 +265,24 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate const bool needs_permute = input->data_layout() == DataLayout::NCHW; + const bool is_quantized = is_data_type_quantized(input->data_type()); + + TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); + if(is_quantized) + { + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + + const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c))); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } + } + if(needs_permute) { TensorShape permuted_input_shape = input->tensor_shape(); @@ -242,12 +300,14 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info, - dwc_info, conv_info, depth_multiplier, dilation)); + dwc_info, conv_info, depth_multiplier, dilation, + &output_multipliers_shifts_info, &output_multipliers_shifts_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, + dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } return Status{}; } @@ -273,6 +333,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() { if(!_is_prepared) { + if(_is_quantized) + { + _output_multipliers.map(); + _output_shifts.map(); + quantization::compute_quantized_multipliers_and_shifts(_input, + _original_weights, + _output, + reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); + _output_multipliers.unmap(); + _output_shifts.unmap(); + } + if(_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); @@ -286,40 +359,63 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() } CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(), - _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _kernel(nullptr), + _border_handler(), + _permute_input_to_nchw(), + _permute_weights_to_nchw(), + _permute_output_to_nhwc(), + _reshape_weights(), + _permuted_input(), + _permuted_weights(), + _permuted_output(), + _output_multipliers(), + _output_shifts(), + _original_weights(nullptr), + _input(nullptr), + _output(nullptr), + _needs_permute(false), + _needs_weights_reshape(false), + _is_prepared(false), + _is_quantized(false) { } void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - // idx_w and idx_h only used for validation - const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_UNUSED(idx_w); - ARM_COMPUTE_UNUSED(idx_h); - - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_w) + (weights->info()->dimension(idx_w) - 1) * (dilation.x() - 1) > input->info()->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(idx_h) + (weights->info()->dimension(idx_h) - 1) * (dilation.y() - 1) > input->info()->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); - - const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC; + const GPUTarget gpu_target = CLScheduler::get().target(); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3::validate(input->info(), + weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), + conv_info, + depth_multiplier, + act_info, + gpu_target, + dilation)); + + const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); _needs_permute = is_nhwc && (depth_multiplier > 1); - _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) - && is_data_type_quantized_asymmetric(input->info()->data_type()); + _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized; + _is_prepared = false; _original_weights = weights; + _input = input; + _output = output; ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; ICLTensor *output_to_use = output; - const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); + const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; + const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); DepthwiseConvolutionReshapeInfo info; info.c0 = 4; @@ -359,9 +455,30 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config _kernel = arm_compute::support::cpp14::make_unique(); } + CLTensor *output_multipliers_to_use = nullptr; + CLTensor *output_shifts_to_use = nullptr; + if(_is_quantized) + { + const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL); + const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1; + + _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + + output_multipliers_to_use = &_output_multipliers; + output_shifts_to_use = &_output_shifts; + } + // Configure kernel - _kernel->set_target(CLScheduler::get().target()); - _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation); + _kernel->set_target(gpu_target); + _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, + act_info, dilation, output_multipliers_to_use, output_shifts_to_use); + + if(_is_quantized) + { + _output_multipliers.allocator()->allocate(); + _output_shifts.allocator()->allocate(); + } // Permute output if needed if(_needs_permute) @@ -412,6 +529,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar { if(!_is_prepared) { + if(_is_quantized) + { + _output_multipliers.map(); + _output_shifts.map(); + quantization::compute_quantized_multipliers_and_shifts(_input, + _original_weights, + _output, + reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), + reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); + _output_multipliers.unmap(); + _output_shifts.unmap(); + } + if(_needs_permute) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index 37563046cc..5d8fd200d3 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -553,6 +553,8 @@ TEST_SUITE_END() // Float template using CLDepthwiseConvolutionLayerQuantizedFixture = DepthwiseConvolutionLayerValidationQuantizedFixture; +template +using CLDepthwiseConvolutionLayerQuantizedPerChannelFixture = DepthwiseConvolutionLayerValidationQuantizedPerChannelFixture; TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) @@ -653,6 +655,113 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), + large_depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.7f, 2) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE(Dilation) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.8, 1) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), + large_depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.9f, 11) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // Generic +TEST_SUITE(W3x3) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.3f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 4) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE(Dilation) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedPerChannelFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("SrcDataType", DataType::QASYMM8)), + framework::dataset::make("WeightsDataType", DataType::QSYMM8_PER_CHANNEL)), + framework::dataset::make("SrcQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DstQuantizationInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // W3x3 +TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized TEST_SUITE_END() // DepthwiseConvolutionLayer -- cgit v1.2.1