aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2018-02-16 15:17:23 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:16 +0000
commit99ac60bca77e9977c844cc1293751d63ddc3065c (patch)
tree30867264809a61c817e5c543a6ac997620f4beaa
parent60954c671ffdc3422bbdb728fc022eb6896c1e17 (diff)
downloadComputeLibrary-99ac60bca77e9977c844cc1293751d63ddc3065c.tar.gz
COMPMID-853 Fuse CL DepthwiseConvolution with Activation for QASYM8
Change-Id: I287908f76af458ad4b4d865d353dc37e33877250 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120839 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h3
-rw-r--r--src/core/CL/cl_kernels/activation_layer.cl6
-rw-r--r--src/core/CL/cl_kernels/activation_layer_qa8.cl34
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl13
-rw-r--r--src/core/CL/cl_kernels/depthwise_convolution_quantized.cl13
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp30
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp4
9 files changed, 80 insertions, 28 deletions
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
index fb51781ecf..84bc09d1fb 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
@@ -53,8 +53,9 @@ public:
* Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: Same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index c348e3a09c..eb12fe4cca 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -58,8 +58,9 @@ public:
* Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overriden:
void run() override;
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 4424a66b61..a8ea7387d6 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,6 +115,8 @@ inline TYPE linear_op(TYPE x)
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#if defined(ACT)
+
/** This performs an activation function floating point inputs.
*
* @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -168,3 +170,5 @@ __kernel void activation_layer(
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)output.ptr);
}
+
+#endif /* defined(ACT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index cb31e99efb..66e54ed6ad 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -44,6 +44,26 @@ inline TYPE lu_brelu_op(TYPE x)
#define ACTIVATION_OP2(op, x) op##_op(x)
#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+#define PERFORM_ACTIVATION_QA8(act, data) \
+ ({ \
+ data = ACTIVATION_OP(act, data); \
+ \
+ VEC_DATA_TYPE(float, VEC_SIZE) \
+ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
+ \
+ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE)); \
+ })
+#else /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QA8(act, data) \
+ ({ \
+ data = ACTIVATION_OP(act, data); \
+ })
+#endif /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+
+#if defined(ACT)
+
/** This performs an activation function on QASYMM8 inputs.
*
* @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -92,19 +112,11 @@ __kernel void activation_layer_qa8(
// Load data
TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
- // Perform activation
- data = ACTIVATION_OP(ACT, data);
-
-#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
- // requantize to output space
- VEC_DATA_TYPE(float, VEC_SIZE)
- fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
-
- fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
- data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
-#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+ data = PERFORM_ACTIVATION_QA8(ACT, data);
// Store result
VSTORE(VEC_SIZE)
(data, 0, (__global DATA_TYPE *)output.ptr);
}
+
+#endif /* defined(ACT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5ddeb1a6a1..0b61b5638c 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,15 +44,12 @@
#endif /* FIXED_POINT_POSITION */
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
-#else /* FUSED_ACT */
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
#define ACTIVATION_FUNC(x) (x)
-#endif /* FUSED_ACT */
+#endif /* defined(FUSED_ACTIVATION) */
/** Apply batch normalization.
*
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 40538a156d..21daee8230 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -26,6 +26,15 @@
#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#if defined(FUSED_ACTIVATION)
+#define DATA_TYPE uchar
+#define VEC_SIZE 8
+#include "activation_layer_qa8.cl"
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
#if CONV_STRIDE_X > 3
#error "Stride X not supported"
#endif /* CONV_STRIDE_X > 3 */
@@ -222,7 +231,7 @@ __kernel void depthwise_convolution_3x3_quantized(
res0 = max(res0, (uchar8)0);
res0 = min(res0, (uchar8)255);
- vstore8(res0, 0, dst.ptr);
+ vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1
values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
@@ -231,7 +240,7 @@ __kernel void depthwise_convolution_3x3_quantized(
res1 = max(res1, (uchar8)0);
res1 = min(res1, (uchar8)255);
- vstore8(res1, 0, dst.ptr + dst_stride_y);
+ vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
#endif /* CONV_STRIDE_Y == 1 */
}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 87fc1d097c..95c8250ee7 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -136,7 +136,7 @@ void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *out
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
+ build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index 7a47bcc704..d50e4d695e 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -49,7 +49,8 @@ BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
return _border_size;
}
-void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -114,6 +115,33 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
+ if(act_info.enabled())
+ {
+ const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+ const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+ const int o1 = input->info()->quantization_info().offset;
+
+ build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+ build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+ build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+ if(output != nullptr)
+ {
+ const float s1 = input->info()->quantization_info().scale;
+ const float s2 = output->info()->quantization_info().scale;
+ const int o2 = output->info()->quantization_info().offset;
+
+ if(o1 != o2 || s1 != s2)
+ {
+ build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+ build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+ build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+ build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+ }
+ }
+ }
}
// Configure the local work size for Bifrost with a value obtained
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376a9e..112af60f35 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -39,13 +39,13 @@ CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
{
}
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
_kernel.set_target(CLScheduler::get().target());
- _kernel.configure(input, weights, biases, output, conv_info);
+ _kernel.configure(input, weights, biases, output, conv_info, act_info);
// Configure border handler
PixelValue &&zero_value(0.f);