From 99ac60bca77e9977c844cc1293751d63ddc3065c Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Fri, 16 Feb 2018 15:17:23 +0000
Subject: COMPMID-853 Fuse CL DepthwiseConvolution with Activation for QASYM8

Change-Id: I287908f76af458ad4b4d865d353dc37e33877250
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120839
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
Reviewed-by: Georgios Pinitas
---
 .../kernels/CLDepthwiseConvolutionLayer3x3Kernel.h |  3 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.h     |  3 +-
 src/core/CL/cl_kernels/activation_layer.cl         |  6 +++-
 src/core/CL/cl_kernels/activation_layer_qa8.cl     | 34 +++++++++++++++-------
 src/core/CL/cl_kernels/batchnormalization_layer.cl | 13 ++++-----
 .../cl_kernels/depthwise_convolution_quantized.cl  | 13 +++++++--
 .../CL/kernels/CLBatchNormalizationLayerKernel.cpp |  2 +-
 .../CLDepthwiseConvolutionLayer3x3Kernel.cpp       | 30 ++++++++++++++++++-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  4 +--
 9 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
index fb51781ecf..84bc09d1fb 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
@@ -53,8 +53,9 @@ public:
      * Data type supported: Same as @p input.
      * @param[out] output Destination tensor. Data type supported: Same as @p input.
      * @param[in] conv_info Padding and stride information to use for the convolution.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
      */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index c348e3a09c..eb12fe4cca 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -58,8 +58,9 @@ public:
      * Data type supported: Same as @p input.
      * @param[out] output Destination tensor. Data type supported: same as @p input.
      * @param[in] conv_info Padding and stride information to use for the convolution.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
      */
-    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overriden:
     void run() override;
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 4424a66b61..a8ea7387d6 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,6 +115,8 @@ inline TYPE linear_op(TYPE x)
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(ACT)
+
 /** This performs an activation function floating point inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -168,3 +170,5 @@ __kernel void activation_layer(
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index cb31e99efb..66e54ed6ad 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -44,6 +44,26 @@ inline TYPE lu_brelu_op(TYPE x)
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+#define PERFORM_ACTIVATION_QA8(act, data) \
+    ({ \
+        data = ACTIVATION_OP(act, data); \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE) \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE)); \
+    })
+#else /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QA8(act, data) \
+    ({ \
+        data = ACTIVATION_OP(act, data); \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+
+#if defined(ACT)
+
 /** This performs an activation function on QASYMM8 inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -92,19 +112,11 @@ __kernel void activation_layer_qa8(
     // Load data
     TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Perform activation
-    data = ACTIVATION_OP(ACT, data);
-
-#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
-    // requantize to output space
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
-
-    fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
-    data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
-#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+    data = PERFORM_ACTIVATION_QA8(ACT, data);
 
     // Store result
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5ddeb1a6a1..0b61b5638c 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,15 +44,12 @@
 
 #endif /* FIXED_POINT_POSITION */
 
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
-#else /* FUSED_ACT */
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
 #define ACTIVATION_FUNC(x) (x)
-#endif /* FUSED_ACT */
+#endif /* defined(FUSED_ACTIVATION) */
 
 /** Apply batch normalization.
  *
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 40538a156d..21daee8230 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -26,6 +26,15 @@
 
 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
 
+#if defined(FUSED_ACTIVATION)
+#define DATA_TYPE uchar
+#define VEC_SIZE 8
+#include "activation_layer_qa8.cl"
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
 #if CONV_STRIDE_X > 3
 #error "Stride X not supported"
 #endif /* CONV_STRIDE_X > 3 */
@@ -222,7 +231,7 @@ __kernel void depthwise_convolution_3x3_quantized(
     res0 = max(res0, (uchar8)0);
     res0 = min(res0, (uchar8)255);
 
-    vstore8(res0, 0, dst.ptr);
+    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
 
 #if CONV_STRIDE_Y == 1
     values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
@@ -231,7 +240,7 @@
     res1 = max(res1, (uchar8)0);
     res1 = min(res1, (uchar8)255);
 
-    vstore8(res1, 0, dst.ptr + dst_stride_y);
+    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
 
 #endif /* CONV_STRIDE_Y == 1 */
 }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 87fc1d097c..95c8250ee7 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -136,7 +136,7 @@ void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *out
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
+    build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
     build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index 7a47bcc704..d50e4d695e 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -49,7 +49,8 @@ BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
     return _border_size;
 }
 
-void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                                     ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -114,6 +115,33 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
         build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
         build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
         build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
+        if(act_info.enabled())
+        {
+            const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            const int o1 = input->info()->quantization_info().offset;
+
+            build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+            build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+            build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+            build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+            if(output != nullptr)
+            {
+                const float s1 = input->info()->quantization_info().scale;
+                const float s2 = output->info()->quantization_info().scale;
+                const int o2 = output->info()->quantization_info().offset;
+
+                if(o1 != o2 || s1 != s2)
+                {
+                    build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+                    build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+                    build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+                    build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+                }
+            }
+        }
     }
 
     // Configure the local work size for Bifrost with a value obtained
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376a9e..112af60f35 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -39,13 +39,13 @@ CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
 {
 }
 
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     _kernel.set_target(CLScheduler::get().target());
-    _kernel.configure(input, weights, biases, output, conv_info);
+    _kernel.configure(input, weights, biases, output, conv_info, act_info);
 
     // Configure border handler
     PixelValue &&zero_value(0.f);
-- 
cgit v1.2.1
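
For reference, a minimal usage sketch of the fused API this patch introduces: a 3x3 QASYMM8 depthwise convolution configured with a bounded ReLU passed through the new ActivationLayerInfo argument. The tensor shapes, quantization parameters and activation bound below are illustrative assumptions, not values taken from the patch; per the new documentation, only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are fused for 3x3 QASYMM8.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Illustrative QASYMM8 tensors: 32x32 input with 16 channels, 3x3 depthwise weights, S32 biases.
        CLTensor input, weights, biases, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32));
        output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));

        // Configure the depthwise convolution with a fused bounded ReLU (clamp to [0, 6]).
        CLDepthwiseConvolutionLayer3x3 dwc;
        dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1),
                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

        // Allocate backing CL buffers (filling them with real data is omitted in this sketch).
        input.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        output.allocator()->allocate();

        dwc.run();
        CLScheduler::get().sync();
        return 0;
    }

When act_info is enabled and the input and output quantization differ, the kernel additionally receives the S1_VAL/S2_VAL/O1_VAL/O2_VAL defines so that PERFORM_ACTIVATION_QA8 requantizes the activated values into the output space; otherwise only the clamp bounds (A_VAL/B_VAL) are passed.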